diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 484b947bda402..dd4116fa16bc5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -131,7 +131,7 @@ /bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci @yota9 # Bazel build system. -/utils/bazel/ @rupprecht @keith +/utils/bazel/ @rupprecht @keith @aaronmondal # InstallAPI and TextAPI /llvm/**/TextAPI/ @cyndyishida diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index d18b9b0b5c2ff..fd4694ebea32d 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -27,6 +27,10 @@ on: required: true default: false type: boolean + secrets: + RELEASE_TASKS_USER_TOKEN: + description: "Secret used to check user permissions." + required: false pull_request: types: diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h index 1093f6ad78a99..9c252c3675951 100644 --- a/bolt/include/bolt/Core/BinarySection.h +++ b/bolt/include/bolt/Core/BinarySection.h @@ -359,15 +359,9 @@ class BinarySection { /// Add a new relocation at the given /p Offset. void addRelocation(uint64_t Offset, MCSymbol *Symbol, uint64_t Type, - uint64_t Addend, uint64_t Value = 0, - bool Pending = false) { + uint64_t Addend, uint64_t Value = 0) { assert(Offset < getSize() && "offset not within section bounds"); - if (!Pending) { - Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); - } else { - PendingRelocations.emplace_back( - Relocation{Offset, Symbol, Type, Addend, Value}); - } + Relocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); } /// Add a dynamic relocation at the given /p Offset. diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp index efa06cd68cb99..f151cf5f63fc5 100644 --- a/bolt/tools/driver/llvm-bolt.cpp +++ b/bolt/tools/driver/llvm-bolt.cpp @@ -173,16 +173,6 @@ void boltMode(int argc, char **argv) { } } -static std::string GetExecutablePath(const char *Argv0) { - SmallString<256> ExecutablePath(Argv0); - // Do a PATH lookup if Argv0 isn't a valid path. - if (!llvm::sys::fs::exists(ExecutablePath)) - if (llvm::ErrorOr P = - llvm::sys::findProgramByName(ExecutablePath)) - ExecutablePath = *P; - return std::string(ExecutablePath); -} - int main(int argc, char **argv) { // Print a stack trace if we signal out. sys::PrintStackTraceOnErrorSignal(argv[0]); @@ -190,7 +180,7 @@ int main(int argc, char **argv) { llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. - std::string ToolPath = GetExecutablePath(argv[0]); + std::string ToolPath = llvm::sys::fs::getMainExecutable(argv[0], nullptr); // Initialize targets and assembly printers/parsers. 
llvm::InitializeAllTargetInfos(); diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp index 9819a8c2b777b..0fefa1b83c3c2 100644 --- a/bolt/unittests/Core/BinaryContext.cpp +++ b/bolt/unittests/Core/BinaryContext.cpp @@ -93,12 +93,13 @@ TEST_P(BinaryContextTester, FlushPendingRelocCALL26) { DataSize, 4); MCSymbol *RelSymbol1 = BC->getOrCreateGlobalSymbol(4, "Func1"); ASSERT_TRUE(RelSymbol1); - BS.addRelocation(8, RelSymbol1, ELF::R_AARCH64_CALL26, 0, 0, true); + BS.addPendingRelocation( + Relocation{8, RelSymbol1, ELF::R_AARCH64_CALL26, 0, 0}); MCSymbol *RelSymbol2 = BC->getOrCreateGlobalSymbol(16, "Func2"); ASSERT_TRUE(RelSymbol2); - BS.addRelocation(12, RelSymbol2, ELF::R_AARCH64_CALL26, 0, 0, true); + BS.addPendingRelocation( + Relocation{12, RelSymbol2, ELF::R_AARCH64_CALL26, 0, 0}); - std::error_code EC; SmallVector Vect(DataSize); raw_svector_ostream OS(Vect); @@ -134,12 +135,13 @@ TEST_P(BinaryContextTester, FlushPendingRelocJUMP26) { (uint8_t *)Data, Size, 4); MCSymbol *RelSymbol1 = BC->getOrCreateGlobalSymbol(4, "Func1"); ASSERT_TRUE(RelSymbol1); - BS.addRelocation(8, RelSymbol1, ELF::R_AARCH64_JUMP26, 0, 0, true); + BS.addPendingRelocation( + Relocation{8, RelSymbol1, ELF::R_AARCH64_JUMP26, 0, 0}); MCSymbol *RelSymbol2 = BC->getOrCreateGlobalSymbol(16, "Func2"); ASSERT_TRUE(RelSymbol2); - BS.addRelocation(12, RelSymbol2, ELF::R_AARCH64_JUMP26, 0, 0, true); + BS.addPendingRelocation( + Relocation{12, RelSymbol2, ELF::R_AARCH64_JUMP26, 0, 0}); - std::error_code EC; SmallVector Vect(Size); raw_svector_ostream OS(Vect); diff --git a/clang/docs/HLSL/FunctionCalls.rst b/clang/docs/HLSL/FunctionCalls.rst index 6d65fe6e3fb20..ea6dc2ad8a4df 100644 --- a/clang/docs/HLSL/FunctionCalls.rst +++ b/clang/docs/HLSL/FunctionCalls.rst @@ -248,13 +248,14 @@ which is a term made up for HLSL. A cx-value is a temporary value which may be the result of a cast, and stores its value back to an lvalue when the value expires. -To represent this concept in Clang we introduce a new ``HLSLOutParamExpr``. An -``HLSLOutParamExpr`` has two forms, one with a single sub-expression and one -with two sub-expressions. +To represent this concept in Clang we introduce a new ``HLSLOutArgExpr``. An +``HLSLOutArgExpr`` has three sub-expressions: -The single sub-expression form is used when the argument expression and the -function parameter are the same type, so no cast is required. As in this -example: +* An OpaqueValueExpr of the argument lvalue expression. +* An OpaqueValueExpr of the copy-initialized parameter temporary. +* A BinaryOpExpr assigning the first with the value of the second. + +Given this example: .. code-block:: c++ @@ -267,23 +268,36 @@ example: Init(V); } -The expected AST formulation for this code would be something like: +The expected AST formulation for this code would be something like the example +below. Due to the nature of OpaqueValueExpr nodes, the nodes repeat in the AST +dump. The fake addresses ``0xSOURCE`` and ``0xTEMPORARY`` denote the source +lvalue and argument temporary lvalue expressions. .. code-block:: text CallExpr 'void' |-ImplicitCastExpr 'void (*)(int &)' | `-DeclRefExpr 'void (int &)' lvalue Function 'Init' 'void (int &)' - |-HLSLOutParamExpr 'int' lvalue inout - `-DeclRefExpr 'int' lvalue Var 'V' 'int' - -The ``HLSLOutParamExpr`` captures that the value is ``inout`` vs ``out`` to -denote whether or not the temporary is initialized from the sub-expression. 
If -no casting is required the sub-expression denotes the lvalue expression that the -cx-value will be copied to when the value expires. - -The two sub-expression form of the AST node is required when the argument type -is not the same as the parameter type. Given this example: + `-HLSLOutArgExpr 'int' lvalue inout + |-OpaqueValueExpr 0xSOURCE 'int' lvalue + | `-DeclRefExpr 'int' lvalue Var 'V' 'int' + |-OpaqueValueExpr 0xTEMPORARY 'int' lvalue + | `-ImplicitCastExpr 'int' + | `-OpaqueValueExpr 0xSOURCE 'int' lvalue + | `-DeclRefExpr 'int' lvalue Var 'V' 'int' + `-BinaryOperator 'int' lvalue '=' + |-OpaqueValueExpr 0xSOURCE 'int' lvalue + | `-DeclRefExpr 'int' lvalue Var 'V' 'int' + `-ImplicitCastExpr 'int' + `-OpaqueValueExpr 0xTEMPORARY 'int' lvalue + `-ImplicitCastExpr 'int' + `-OpaqueValueExpr 0xSOURCE 'int' lvalue + `-DeclRefExpr 'int' lvalue Var 'V' 'int' + +The ``HLSLOutArgExpr`` captures that the value is ``inout`` vs ``out`` to +denote whether or not the temporary is initialized from the sub-expression. + +The example below demonstrates argument casting: .. code-block:: c++ @@ -295,7 +309,7 @@ is not the same as the parameter type. Given this example: Trunc(F); } -For this case the ``HLSLOutParamExpr`` will have sub-expressions to record both +For this case the ``HLSLOutArgExpr`` will have sub-expressions to record both casting expression sequences for the initialization and write back: .. code-block:: text @@ -303,20 +317,31 @@ casting expression sequences for the initialization and write back: -CallExpr 'void' |-ImplicitCastExpr 'void (*)(int3 &)' | `-DeclRefExpr 'void (int3 &)' lvalue Function 'inc_i32' 'void (int3 &)' - `-HLSLOutParamExpr 'int3' lvalue inout - |-ImplicitCastExpr 'float3' - | `-ImplicitCastExpr 'int3' - | `-OpaqueValueExpr 'int3' lvalue - `-ImplicitCastExpr 'int3' - `-ImplicitCastExpr 'float3' - `-DeclRefExpr 'float3' lvalue 'F' 'float3' - -In this formation the write-back casts are captured as the first sub-expression -and they cast from an ``OpaqueValueExpr``. In IR generation we can use the -``OpaqueValueExpr`` as a placeholder for the ``HLSLOutParamExpr``'s temporary -value on function return. - -In code generation this can be implemented with some targeted extensions to the -Objective-C write-back support. Specifically extending CGCall.cpp's -``EmitWriteback`` function to support casting expressions and emission of -aggregate lvalues. + `-HLSLOutArgExpr 'int3':'vector' lvalue inout + |-OpaqueValueExpr 0xSOURCE 'float3':'vector' lvalue + | `-DeclRefExpr 'float3':'vector' lvalue Var 'F' 'float3':'vector' + |-OpaqueValueExpr 0xTEMPORARY 'int3':'vector' lvalue + | `-ImplicitCastExpr 'vector' + | `-ImplicitCastExpr 'float3':'vector' + | `-OpaqueValueExpr 0xSOURCE 'float3':'vector' lvalue + | `-DeclRefExpr 'float3':'vector' lvalue Var 'F' 'float3':'vector' + `-BinaryOperator 'float3':'vector' lvalue '=' + |-OpaqueValueExpr 0xSOURCE 'float3':'vector' lvalue + | `-DeclRefExpr 'float3':'vector' lvalue Var 'F' 'float3':'vector' + `-ImplicitCastExpr 'vector' + `-ImplicitCastExpr 'int3':'vector' + `-OpaqueValueExpr 0xTEMPORARY 'int3':'vector' lvalue + `-ImplicitCastExpr 'vector' + `-ImplicitCastExpr 'float3':'vector' + `-OpaqueValueExpr 0xSOURCE 'float3':'vector' lvalue + `-DeclRefExpr 'float3':'vector' lvalue Var 'F' 'float3':'vector' + +The AST representation is the same whether casting is required or not, which +simplifies the code generation. IR generation does the following: + +* Emit the argument lvalue expression. 
+* Initialize the argument: + * For ``inout`` arguments, emit the copy-initialization expression. + * For ``out`` arguments, emit an uninitialized temporary. +* Emit the call. +* Emit the write-back BinaryOperator expression. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 369d9e9de7d16..afd5cdc9c49e1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -54,6 +54,8 @@ ABI Changes in This Version AST Dumping Potentially Breaking Changes ---------------------------------------- +- Added support for dumping template arguments of structural value kinds. + Clang Frontend Potentially Breaking Changes ------------------------------------------- @@ -104,6 +106,10 @@ Non-comprehensive list of changes in this release New Compiler Flags ------------------ +- New option ``-fprofile-continuous`` added to enable continuous profile syncing to file (#GH124353, `docs `_). + The feature has `existed `_ + for a while and this is just a user-facing option. + Deprecated Compiler Flags ------------------------- @@ -141,6 +147,9 @@ Improvements to Coverage Mapping Bug Fixes in This Version ------------------------- +- Clang now outputs correct values when #embed data contains bytes with negative + signed char values (#GH102798). + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -248,6 +257,9 @@ clang-format libclang -------- +- Fixed a buffer overflow in the ``CXString`` implementation. The fix may result in + increased memory allocation. + Code Completion --------------- diff --git a/clang/docs/TypeSanitizer.rst b/clang/docs/TypeSanitizer.rst index 4d1dfc23a6c51..3c683a6c24bb4 100644 --- a/clang/docs/TypeSanitizer.rst +++ b/clang/docs/TypeSanitizer.rst @@ -27,7 +27,7 @@ reduce these impacts. The TypeSanitizer Algorithm =========================== For each TBAA type-access descriptor, encoded in LLVM IR using TBAA Metadata, the instrumentation -pass generates descriptor tales. Thus there is a unique pointer to each type (and access descriptor). +pass generates descriptor tables. Thus there is a unique pointer to each type (and access descriptor). These tables are comdat (except for anonymous-namespace types), so the pointer values are unique across the program. diff --git a/clang/docs/analyzer/developer-docs/PerformanceInvestigation.rst b/clang/docs/analyzer/developer-docs/PerformanceInvestigation.rst index 3ee6e117a8465..ca3a56828209b 100644 --- a/clang/docs/analyzer/developer-docs/PerformanceInvestigation.rst +++ b/clang/docs/analyzer/developer-docs/PerformanceInvestigation.rst @@ -5,6 +5,9 @@ Performance Investigation Multiple factors contribute to the time it takes to analyze a file with Clang Static Analyzer. A translation unit contains multiple entry points, each of which take multiple steps to analyze. +Performance analysis using ``-ftime-trace`` +=========================================== + You can add the ``-ftime-trace=file.json`` option to break down the analysis time into individual entry points and steps within each entry point. You can explore the generated JSON file in a Chromium browser using the ``chrome://tracing`` URL, or using `speedscope `_. @@ -19,9 +22,8 @@ Here is an example of a time trace produced with .. code-block:: bash :caption: Clang Static Analyzer invocation to generate a time trace of string.c analysis.
- clang -cc1 -nostdsysteminc -analyze -analyzer-constraints=range \ - -setup-static-analyzer -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection \ - -verify ./clang/test/Analysis/string.c \ + clang -cc1 -analyze -verify clang/test/Analysis/string.c \ + -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection \ -ftime-trace=trace.json -ftime-trace-granularity=1 .. image:: ../images/speedscope.png @@ -45,3 +47,91 @@ Note: Both Chrome-tracing and speedscope tools might struggle with time traces a Luckily, in most cases the default max-steps boundary of 225 000 produces the traces of approximately that size for a single entry point. You can use ``-analyze-function=get_global_options`` together with ``-ftime-trace`` to narrow down analysis to a specific entry point. + + +Performance analysis using ``perf`` +=================================== + +`Perf `_ is a tool for conducting sampling-based profiling. +It's easy to start profiling; you only have two prerequisites. +Build with ``-fno-omit-frame-pointer`` and debug info (``-g``). +You can use release builds, but probably the easiest is to set ``CMAKE_BUILD_TYPE=RelWithDebInfo`` +along with ``CMAKE_CXX_FLAGS="-fno-omit-frame-pointer"`` when configuring ``llvm``. +Here is how to `get started `_ if you run into trouble. + +.. code-block:: bash + :caption: Running the Clang Static Analyzer through ``perf`` to gather samples of the execution. + + # -F: Sampling frequency, use `-F max` for maximal frequency + # -g: Enable call-graph recording for both kernel and user space + perf record -F 99 -g -- clang -cc1 -analyze -verify clang/test/Analysis/string.c \ + -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection + +Once you have the profile data, you can use it to produce a Flame graph. +A Flame graph is a visual representation of the stack frames of the samples. +Common stack frame prefixes are squashed together, making up a wider bar. +The wider the bar, the more time was spent under that particular stack frame, +giving a sense of how the overall execution time was spent. + +Clone the `FlameGraph `_ git repository, +as we will use some scripts from there to convert the ``perf`` samples into a Flame graph. +It's also useful to check out the `homepage `_ of Brendan Gregg, +the author of FlameGraph. + + +.. code-block:: bash + :caption: Converting the ``perf`` profile into a Flame graph, then opening it in Firefox. + + perf script | /path/to/FlameGraph/stackcollapse-perf.pl > perf.folded + /path/to/FlameGraph/flamegraph.pl perf.folded > perf.svg + firefox perf.svg + +.. image:: ../images/flamegraph.png + + +Performance analysis using ``uftrace`` +====================================== + +`uftrace `_ is a great tool to generate rich profile data +that you can use to focus and drill down into the timeline of your application. +We will use it to generate Chromium trace JSON. +In contrast to ``perf``, this approach statically instruments every function, so it should be more precise and thorough than sampling-based approaches. +In contrast to using ``-ftime-trace``, functions don't need to opt-in to be profiled using ``llvm::TimeTraceScope``. +All functions are profiled due to automatic static instrumentation. + +There is only one prerequisite to use this tool. +You need to build the binary you are about to instrument using ``-pg`` or ``-finstrument-functions``. +This will make it run substantially slower but allows rich instrumentation.
+It will also consume many gigabytes of storage for a single trace unless filter flags are used during recording. + +.. code-block:: bash + :caption: Recording with ``uftrace``, then dumping the result as a Chrome trace JSON. + + uftrace record clang -cc1 -analyze -verify clang/test/Analysis/string.c \ + -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection + uftrace dump --filter=".*::AnalysisConsumer::HandleTranslationUnit" --time-filter=300 --chrome > trace.json + +.. image:: ../images/uftrace_detailed.png + +In this picture, you can see the functions below the Static Analyzer's entry point, which take at least 300 nanoseconds to run, visualized by Chrome's ``about:tracing`` page. +You can also see how deep the function calls can get due to AST visitors. + +Using different filters can reduce the number of functions to record. +For the common options, refer to the ``uftrace`` `documentation `_. + +Similar filters can be applied for dumping too. That way you can reuse the same (detailed) +recording to selectively focus on some special part using a refinement of the filter flags. +Remember, the trace JSON needs to fit into Chrome's ``about:tracing`` or `speedscope `_, +so it needs to be of limited size. +If you do not apply filters on recording, you will collect a large trace and every dump operation +would need to sift through the much larger recording, which may be annoying if done repeatedly. + +If the trace JSON is still too large to load, have a look at the dump as plain text and look for frequent entries that refer to uninteresting parts. +Once you have some of those, add them as ``--hide`` flags to the ``uftrace dump`` call. +To see what functions appear frequently in the trace, use this command: + +.. code-block:: bash + + cat trace.json | grep -Po '"name":"(.+)"' | sort | uniq -c | sort -nr | head -n 50 + +``uftrace`` can also dump the report as a Flame graph using ``uftrace dump --flame-graph``. diff --git a/clang/docs/analyzer/images/flamegraph.png b/clang/docs/analyzer/images/flamegraph.png new file mode 100644 index 0000000000000..b16ec90b9e600 Binary files /dev/null and b/clang/docs/analyzer/images/flamegraph.png differ diff --git a/clang/docs/analyzer/images/uftrace_detailed.png b/clang/docs/analyzer/images/uftrace_detailed.png new file mode 100644 index 0000000000000..fcf681909d070 Binary files /dev/null and b/clang/docs/analyzer/images/uftrace_detailed.png differ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 65be782c1ba43..a96b9c0a17045 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1733,6 +1733,47 @@ class ASTContext : public RefCountedBase<ASTContext> { unsigned NumPositiveBits, QualType &BestType, QualType &BestPromotionType); + /// Determine whether the given integral value is representable within + /// the given type T. + bool isRepresentableIntegerValue(llvm::APSInt &Value, QualType T); + + /// Compute NumNegativeBits and NumPositiveBits for an enum based on + /// the constant values of its enumerators. + template <typename RangeT> + bool computeEnumBits(RangeT EnumConstants, unsigned &NumNegativeBits, + unsigned &NumPositiveBits) { + NumNegativeBits = 0; + NumPositiveBits = 0; + bool MembersRepresentableByInt = true; + for (auto *Elem : EnumConstants) { + EnumConstantDecl *ECD = cast_or_null<EnumConstantDecl>(Elem); + if (!ECD) + continue; // Already issued a diagnostic.
+ + llvm::APSInt InitVal = ECD->getInitVal(); + if (InitVal.isUnsigned() || InitVal.isNonNegative()) { + // If the enumerator is zero that should still be counted as a positive + // bit since we need a bit to store the value zero. + unsigned ActiveBits = InitVal.getActiveBits(); + NumPositiveBits = std::max({NumPositiveBits, ActiveBits, 1u}); + } else { + NumNegativeBits = + std::max(NumNegativeBits, (unsigned)InitVal.getSignificantBits()); + } + + MembersRepresentableByInt &= isRepresentableIntegerValue(InitVal, IntTy); + } + + // If we have an empty set of enumerators we still need one bit. + // From [dcl.enum]p8 + // If the enumerator-list is empty, the values of the enumeration are as if + // the enumeration had a single enumerator with value 0 + if (!NumPositiveBits && !NumNegativeBits) + NumPositiveBits = 1; + + return MembersRepresentableByInt; + } + QualType getUnresolvedUsingType(const UnresolvedUsingTypenameDecl *Decl) const; diff --git a/clang/include/clang/AST/JSONNodeDumper.h b/clang/include/clang/AST/JSONNodeDumper.h index 9422c8fceccfb..660a05435003d 100644 --- a/clang/include/clang/AST/JSONNodeDumper.h +++ b/clang/include/clang/AST/JSONNodeDumper.h @@ -345,6 +345,7 @@ class JSONNodeDumper void VisitDeclarationTemplateArgument(const TemplateArgument &TA); void VisitNullPtrTemplateArgument(const TemplateArgument &TA); void VisitIntegralTemplateArgument(const TemplateArgument &TA); + void VisitStructuralValueTemplateArgument(const TemplateArgument &TA); void VisitTemplateTemplateArgument(const TemplateArgument &TA); void VisitTemplateExpansionTemplateArgument(const TemplateArgument &TA); void VisitExpressionTemplateArgument(const TemplateArgument &TA); diff --git a/clang/include/clang/AST/Mangle.h b/clang/include/clang/AST/Mangle.h index d5f6c0f6cc67d..6134a70c04bd3 100644 --- a/clang/include/clang/AST/Mangle.h +++ b/clang/include/clang/AST/Mangle.h @@ -22,32 +22,29 @@ #include namespace llvm { - class raw_ostream; +class raw_ostream; } namespace clang { - class ASTContext; - class BlockDecl; - class CXXConstructorDecl; - class CXXDestructorDecl; - class CXXMethodDecl; - class FunctionDecl; - struct MethodVFTableLocation; - class NamedDecl; - class ObjCMethodDecl; - class StringLiteral; - struct ThisAdjustment; - struct ThunkInfo; - class VarDecl; +class ASTContext; +class BlockDecl; +class CXXConstructorDecl; +class CXXDestructorDecl; +class CXXMethodDecl; +class FunctionDecl; +struct MethodVFTableLocation; +class NamedDecl; +class ObjCMethodDecl; +class StringLiteral; +struct ThisAdjustment; +struct ThunkInfo; +class VarDecl; /// MangleContext - Context for tracking state which persists across multiple /// calls to the C++ name mangler. class MangleContext { public: - enum ManglerKind { - MK_Itanium, - MK_Microsoft - }; + enum ManglerKind { MK_Itanium, MK_Microsoft }; private: virtual void anchor(); @@ -59,10 +56,10 @@ class MangleContext { /// ASTContext. 
bool IsAux = false; - llvm::DenseMap GlobalBlockIds; - llvm::DenseMap LocalBlockIds; - llvm::DenseMap AnonStructIds; - llvm::DenseMap FuncAnonStructSize; + llvm::DenseMap GlobalBlockIds; + llvm::DenseMap LocalBlockIds; + llvm::DenseMap AnonStructIds; + llvm::DenseMap FuncAnonStructSize; public: ManglerKind getKind() const { return Kind; } @@ -73,7 +70,7 @@ class MangleContext { ManglerKind Kind, bool IsAux = false) : Context(Context), Diags(Diags), Kind(Kind), IsAux(IsAux) {} - virtual ~MangleContext() { } + virtual ~MangleContext() {} ASTContext &getASTContext() const { return Context; } @@ -82,10 +79,10 @@ class MangleContext { virtual void startNewFunction() { LocalBlockIds.clear(); } unsigned getBlockId(const BlockDecl *BD, bool Local) { - llvm::DenseMap &BlockIds - = Local? LocalBlockIds : GlobalBlockIds; + llvm::DenseMap &BlockIds = + Local ? LocalBlockIds : GlobalBlockIds; std::pair::iterator, bool> - Result = BlockIds.insert(std::make_pair(BD, BlockIds.size())); + Result = BlockIds.insert(std::make_pair(BD, BlockIds.size())); return Result.first->second; } @@ -125,7 +122,7 @@ class MangleContext { return false; } - virtual void needsUniqueInternalLinkageNames() { } + virtual void needsUniqueInternalLinkageNames() {} // FIXME: consider replacing raw_ostream & with something like SmallString &. void mangleName(GlobalDecl GD, raw_ostream &); @@ -143,10 +140,9 @@ class MangleContext { virtual void mangleCXXRTTIName(QualType T, raw_ostream &, bool NormalizeIntegers = false) = 0; virtual void mangleStringLiteral(const StringLiteral *SL, raw_ostream &) = 0; - virtual void mangleMSGuidDecl(const MSGuidDecl *GD, raw_ostream&); + virtual void mangleMSGuidDecl(const MSGuidDecl *GD, raw_ostream &); - void mangleGlobalBlock(const BlockDecl *BD, - const NamedDecl *ID, + void mangleGlobalBlock(const BlockDecl *BD, const NamedDecl *ID, raw_ostream &Out); void mangleCtorBlock(const CXXConstructorDecl *CD, CXXCtorType CT, const BlockDecl *BD, raw_ostream &Out); @@ -314,6 +310,6 @@ class ASTNameGenerator { class Implementation; std::unique_ptr Impl; }; -} +} // namespace clang #endif diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index bfd205ffb0d99..4b5ad2b5fa74c 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -249,6 +249,7 @@ class TextNodeDumper void VisitDeclarationTemplateArgument(const TemplateArgument &TA); void VisitNullPtrTemplateArgument(const TemplateArgument &TA); void VisitIntegralTemplateArgument(const TemplateArgument &TA); + void VisitStructuralValueTemplateArgument(const TemplateArgument &TA); void VisitTemplateTemplateArgument(const TemplateArgument &TA); void VisitTemplateExpansionTemplateArgument(const TemplateArgument &TA); void VisitExpressionTemplateArgument(const TemplateArgument &TA); diff --git a/clang/include/clang/Basic/FPOptions.def b/clang/include/clang/Basic/FPOptions.def index 79f04c89c9fed..90428c3c73c8b 100644 --- a/clang/include/clang/Basic/FPOptions.def +++ b/clang/include/clang/Basic/FPOptions.def @@ -28,5 +28,5 @@ OPTION(FPEvalMethod, LangOptions::FPEvalMethodKind, 2, AllowApproxFunc) OPTION(Float16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, FPEvalMethod) OPTION(BFloat16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, Float16ExcessPrecision) OPTION(MathErrno, bool, 1, BFloat16ExcessPrecision) -OPTION(ComplexRange, LangOptions::ComplexRangeKind, 2, MathErrno) +OPTION(ComplexRange, LangOptions::ComplexRangeKind, 3, MathErrno) #undef OPTION diff 
--git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index cb55f09acc076..bfab0baa089cf 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -238,7 +238,7 @@ BENIGN_LANGOPT(NoSignedZero , 1, 0, "Permit Floating Point optimization wit BENIGN_LANGOPT(AllowRecip , 1, 0, "Permit Floating Point reciprocal") BENIGN_LANGOPT(ApproxFunc , 1, 0, "Permit Floating Point approximation") -ENUM_LANGOPT(ComplexRange, ComplexRangeKind, 2, CX_None, "Enable use of range reduction for complex arithmetics.") +ENUM_LANGOPT(ComplexRange, ComplexRangeKind, 3, CX_None, "Enable use of range reduction for complex arithmetics.") BENIGN_LANGOPT(ObjCGCBitmapPrint , 1, 0, "printing of GC's bitmap layout for __weak/__strong ivars") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index f58a719a45a84..651b3b67b1058 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -648,9 +648,12 @@ class LangOptions : public LangOptionsBase { // Define accessors/mutators for language options of enumeration type. #define LANGOPT(Name, Bits, Default, Description) -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ - Type get##Name() const { return static_cast(Name); } \ - void set##Name(Type Value) { Name = static_cast(Value); } +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ + Type get##Name() const { return static_cast(Name); } \ + void set##Name(Type Value) { \ + assert(static_cast(Value) < (1u << Bits)); \ + Name = static_cast(Value); \ + } #include "clang/Basic/LangOptions.def" /// Are we compiling a module? @@ -959,11 +962,14 @@ class FPOptions { void applyChanges(FPOptionsOverride FPO); // We can define most of the accessors automatically: + // TODO: consider enforcing the assertion that value fits within bits + // statically. #define OPTION(NAME, TYPE, WIDTH, PREVIOUS) \ TYPE get##NAME() const { \ return static_cast((Value & NAME##Mask) >> NAME##Shift); \ } \ void set##NAME(TYPE value) { \ + assert(storage_type(value) < (1u << WIDTH)); \ Value = (Value & ~NAME##Mask) | (storage_type(value) << NAME##Shift); \ } #include "clang/Basic/FPOptions.def" diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 1cf62ab466134..618815db28434 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3397,7 +3397,7 @@ def fno_objc_weak : Flag<["-"], "fno-objc-weak">, Group, def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">, Group, Visibility<[ClangOption, FlangOption]>; defm operator_names : BoolFOption<"operator-names", - LangOpts<"CXXOperatorNames">, Default, + LangOpts<"CXXOperatorNames">, Default, NegFlag, PosFlag>; diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def index 34bb7a809162b..a9b8d0753673b 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def @@ -294,6 +294,16 @@ ANALYZER_OPTION( bool, ShouldUnrollLoops, "unroll-loops", "Whether the analysis should try to unroll loops with known bounds.", false) +ANALYZER_OPTION( + bool, ShouldAssumeAtLeastOneIteration, "assume-at-least-one-iteration", + "Whether the analyzer should always assume at least one iteration in " + "loops where the loop condition is opaque (i.e. 
the analyzer cannot " + "determine if it's true or false). Setting this to true eliminates some " + "false positives (where e.g. a structure is nonempty, but the analyzer " + "does not notice this); but it also eliminates some true positives (e.g. " + "cases where a structure can be empty and this causes buggy behavior).", + false) + ANALYZER_OPTION( bool, ShouldDisplayNotesAsEvents, "notes-as-events", "Whether the bug reporter should transparently treat extra note diagnostic " diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index e3b44bdbe3dc5..b1b9d56ccca9f 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5320,6 +5320,19 @@ bool ASTContext::computeBestEnumTypes(bool IsPacked, unsigned NumNegativeBits, return EnumTooLarge; } +bool ASTContext::isRepresentableIntegerValue(llvm::APSInt &Value, QualType T) { + assert((T->isIntegralType(*this) || T->isEnumeralType()) && + "Integral type required!"); + unsigned BitWidth = getIntWidth(T); + + if (Value.isUnsigned() || Value.isNonNegative()) { + if (T->isSignedIntegerOrEnumerationType()) + --BitWidth; + return Value.getActiveBits() <= BitWidth; + } + return Value.getSignificantBits() <= BitWidth; +} + QualType ASTContext::getUnresolvedUsingType( const UnresolvedUsingTypenameDecl *Decl) const { if (Decl->TypeForDecl) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 0e586725b5869..55ac41736344d 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1683,7 +1683,7 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC, assert(!ElemT); // Structs etc. const Descriptor *Desc = S.P.createDescriptor( - Call, ElemType.getTypePtr(), Descriptor::InlineDescMD, + NewCall, ElemType.getTypePtr(), Descriptor::InlineDescMD, /*IsConst=*/false, /*IsTemporary=*/false, /*IsMutable=*/false, /*Init=*/nullptr); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 192b679b4c995..5c6ca4c9ee4de 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12536,10 +12536,9 @@ static const Expr *ignorePointerCastsAndParens(const Expr *E) { static bool isDesignatorAtObjectEnd(const ASTContext &Ctx, const LValue &LVal) { assert(!LVal.Designator.Invalid); - auto IsLastOrInvalidFieldDecl = [&Ctx](const FieldDecl *FD, bool &Invalid) { + auto IsLastOrInvalidFieldDecl = [&Ctx](const FieldDecl *FD) { const RecordDecl *Parent = FD->getParent(); - Invalid = Parent->isInvalidDecl(); - if (Invalid || Parent->isUnion()) + if (Parent->isInvalidDecl() || Parent->isUnion()) return true; const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(Parent); return FD->getFieldIndex() + 1 == Layout.getFieldCount(); @@ -12548,14 +12547,12 @@ static bool isDesignatorAtObjectEnd(const ASTContext &Ctx, const LValue &LVal) { auto &Base = LVal.getLValueBase(); if (auto *ME = dyn_cast_or_null(Base.dyn_cast())) { if (auto *FD = dyn_cast(ME->getMemberDecl())) { - bool Invalid; - if (!IsLastOrInvalidFieldDecl(FD, Invalid)) - return Invalid; + if (!IsLastOrInvalidFieldDecl(FD)) + return false; } else if (auto *IFD = dyn_cast(ME->getMemberDecl())) { for (auto *FD : IFD->chain()) { - bool Invalid; - if (!IsLastOrInvalidFieldDecl(cast(FD), Invalid)) - return Invalid; + if (!IsLastOrInvalidFieldDecl(cast(FD))) + return false; } } } @@ -12591,9 +12588,8 @@ static bool isDesignatorAtObjectEnd(const ASTContext &Ctx, const LValue &LVal) { return false; BaseType = CT->getElementType(); } 
else if (auto *FD = getAsField(Entry)) { - bool Invalid; - if (!IsLastOrInvalidFieldDecl(FD, Invalid)) - return Invalid; + if (!IsLastOrInvalidFieldDecl(FD)) + return false; BaseType = FD->getType(); } else { assert(getAsBaseClass(Entry) && "Expecting cast to a base class"); diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 169e3ee94c221..27fd214dcee3b 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -1705,6 +1705,10 @@ void JSONNodeDumper::VisitNullPtrTemplateArgument(const TemplateArgument &TA) { void JSONNodeDumper::VisitIntegralTemplateArgument(const TemplateArgument &TA) { JOS.attribute("value", TA.getAsIntegral().getSExtValue()); } +void JSONNodeDumper::VisitStructuralValueTemplateArgument( + const TemplateArgument &TA) { + Visit(TA.getAsStructuralValue(), TA.getStructuralValueType()); +} void JSONNodeDumper::VisitTemplateTemplateArgument(const TemplateArgument &TA) { // FIXME: cannot just call dump() on the argument, as that doesn't specify // the output format. diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 6da1f776b4b63..08efcda46b8f9 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1226,6 +1226,12 @@ void TextNodeDumper::VisitIntegralTemplateArgument(const TemplateArgument &TA) { dumpTemplateArgument(TA); } +void TextNodeDumper::VisitStructuralValueTemplateArgument( + const TemplateArgument &TA) { + OS << " structural value"; + dumpTemplateArgument(TA); +} + void TextNodeDumper::dumpTemplateName(TemplateName TN, StringRef Label) { AddChild(Label, [=] { { diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp index 481932ee59c8e..af563702b77bf 100644 --- a/clang/lib/Analysis/LiveVariables.cpp +++ b/clang/lib/Analysis/LiveVariables.cpp @@ -16,7 +16,9 @@ #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "clang/Analysis/FlowSensitive/DataflowWorklist.h" +#include "clang/Basic/SourceManager.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -662,12 +664,19 @@ void LiveVariables::dumpExprLiveness(const SourceManager &M) { } void LiveVariablesImpl::dumpExprLiveness(const SourceManager &M) { + auto ByBeginLoc = [&M](const Expr *L, const Expr *R) { + return M.isBeforeInTranslationUnit(L->getBeginLoc(), R->getBeginLoc()); + }; + // Don't iterate over blockEndsToLiveness directly because it's not sorted. 
for (const CFGBlock *B : *analysisContext.getCFG()) { llvm::errs() << "\n[ B" << B->getBlockID() << " (live expressions at block exit) ]\n"; - for (const Expr *E : blocksEndToLiveness[B].liveExprs) { + std::vector LiveExprs; + llvm::append_range(LiveExprs, blocksEndToLiveness[B].liveExprs); + llvm::sort(LiveExprs, ByBeginLoc); + for (const Expr *E : LiveExprs) { llvm::errs() << "\n"; E->dump(); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index d5b584ec0f2e9..46ad11e64c4d5 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -5122,7 +5122,7 @@ void CGDebugInfo::EmitLabel(const LabelDecl *D, CGBuilderTy &Builder) { DBuilder.insertLabel(L, llvm::DILocation::get(CGM.getLLVMContext(), Line, Column, Scope, CurInlinedAt), - Builder.GetInsertBlock()); + Builder.GetInsertBlock()->end()); } llvm::DIType *CGDebugInfo::CreateSelfType(const QualType &QualTy, @@ -5200,7 +5200,7 @@ void CGDebugInfo::EmitDeclareOfBlockDeclRefVariable( LexicalBlockStack.back(), CurInlinedAt); auto *Expr = DBuilder.createExpression(addr); if (InsertPoint) - DBuilder.insertDeclare(Storage, D, Expr, DL, InsertPoint); + DBuilder.insertDeclare(Storage, D, Expr, DL, InsertPoint->getIterator()); else DBuilder.insertDeclare(Storage, D, Expr, DL, Builder.GetInsertBlock()); } @@ -5865,7 +5865,7 @@ void CGDebugInfo::EmitPseudoVariable(CGBuilderTy &Builder, if (auto InsertPoint = Value->getInsertionPointAfterDef()) { DBuilder.insertDbgValueIntrinsic(Value, D, DBuilder.createExpression(), DIL, - &**InsertPoint); + *InsertPoint); } } diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 2ce54cc3c52ef..03ddc87d8d3df 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -54,10 +54,6 @@ void addDxilValVersion(StringRef ValVersionStr, llvm::Module &M) { auto *DXILValMD = M.getOrInsertNamedMetadata(DXILValKey); DXILValMD->addOperand(Val); } -void addDisableOptimizations(llvm::Module &M) { - StringRef Key = "dx.disable_optimizations"; - M.addModuleFlag(llvm::Module::ModFlagBehavior::Override, Key, 1); -} // cbuffer will be translated into global variable in special address space. // If translate into C, // cbuffer A { @@ -171,8 +167,6 @@ void CGHLSLRuntime::finishCodeGen() { addDxilValVersion(TargetOpts.DxilValidatorVersion, M); generateGlobalCtorDtorCalls(); - if (CGM.getCodeGenOpts().OptimizationLevel == 0) - addDisableOptimizations(M); const DataLayout &DL = M.getDataLayout(); @@ -345,6 +339,13 @@ void clang::CodeGen::CGHLSLRuntime::setHLSLEntryAttributes( WaveSizeAttr->getPreferred()); Fn->addFnAttr(WaveSizeKindStr, WaveSizeStr); } + // HLSL entry functions are materialized for module functions with + // HLSLShaderAttr attribute. SetLLVMFunctionAttributesForDefinition called + // later in the compiler-flow for such module functions is not aware of and + // hence not able to set attributes of the newly materialized entry functions. + // So, set attributes of entry function here, as appropriate. 
+ if (CGM.getCodeGenOpts().OptimizationLevel == 0) + Fn->addFnAttr(llvm::Attribute::OptimizeNone); Fn->addFnAttr(llvm::Attribute::NoInline); } diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 6c929a6431c0f..16986de96bdbc 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -63,7 +63,7 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getMessageSendFn() const { // Add the non-lazy-bind attribute, since objc_msgSend is likely to // be called a lot. - llvm::Type *params[] = { ObjectPtrTy, SelectorPtrTy }; + llvm::Type *params[] = {ObjectPtrTy, SelectorPtrTy}; return CGM.CreateRuntimeFunction( llvm::FunctionType::get(ObjectPtrTy, params, true), "objc_msgSend", llvm::AttributeList::get(CGM.getLLVMContext(), @@ -77,10 +77,10 @@ class ObjCCommonTypesHelper { /// by indirect reference in the first argument, and therefore the /// self and selector parameters are shifted over by one. llvm::FunctionCallee getMessageSendStretFn() const { - llvm::Type *params[] = { ObjectPtrTy, SelectorPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(CGM.VoidTy, - params, true), - "objc_msgSend_stret"); + llvm::Type *params[] = {ObjectPtrTy, SelectorPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(CGM.VoidTy, params, true), + "objc_msgSend_stret"); } /// [double | long double] objc_msgSend_fpret(id self, SEL op, ...) @@ -89,10 +89,10 @@ class ObjCCommonTypesHelper { /// floating-point stack; without a special entrypoint, the nil case /// would be unbalanced. llvm::FunctionCallee getMessageSendFpretFn() const { - llvm::Type *params[] = { ObjectPtrTy, SelectorPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(CGM.DoubleTy, - params, true), - "objc_msgSend_fpret"); + llvm::Type *params[] = {ObjectPtrTy, SelectorPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(CGM.DoubleTy, params, true), + "objc_msgSend_fpret"); } /// _Complex long double objc_msgSend_fp2ret(id self, SEL op, ...) @@ -101,14 +101,14 @@ class ObjCCommonTypesHelper { /// x87 floating point stack; without a special entrypoint, the nil case /// would be unbalanced. Only used on 64-bit X86. llvm::FunctionCallee getMessageSendFp2retFn() const { - llvm::Type *params[] = { ObjectPtrTy, SelectorPtrTy }; + llvm::Type *params[] = {ObjectPtrTy, SelectorPtrTy}; llvm::Type *longDoubleType = llvm::Type::getX86_FP80Ty(VMContext); llvm::Type *resultType = llvm::StructType::get(longDoubleType, longDoubleType); - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(resultType, - params, true), - "objc_msgSend_fp2ret"); + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(resultType, params, true), + "objc_msgSend_fp2ret"); } /// id objc_msgSendSuper(struct objc_super *super, SEL op, ...) @@ -117,10 +117,10 @@ class ObjCCommonTypesHelper { /// semantics. The class passed is the superclass of the current /// class. llvm::FunctionCallee getMessageSendSuperFn() const { - llvm::Type *params[] = { SuperPtrTy, SelectorPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSendSuper"); + llvm::Type *params[] = {SuperPtrTy, SelectorPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSendSuper"); } /// id objc_msgSendSuper2(struct objc_super *super, SEL op, ...) @@ -128,10 +128,10 @@ class ObjCCommonTypesHelper { /// A slightly different messenger used for super calls. The class /// passed is the current class. 
llvm::FunctionCallee getMessageSendSuperFn2() const { - llvm::Type *params[] = { SuperPtrTy, SelectorPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSendSuper2"); + llvm::Type *params[] = {SuperPtrTy, SelectorPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSendSuper2"); } /// void objc_msgSendSuper_stret(void *stretAddr, struct objc_super *super, @@ -139,10 +139,10 @@ class ObjCCommonTypesHelper { /// /// The messenger used for super calls which return an aggregate indirectly. llvm::FunctionCallee getMessageSendSuperStretFn() const { - llvm::Type *params[] = { Int8PtrTy, SuperPtrTy, SelectorPtrTy }; + llvm::Type *params[] = {Int8PtrTy, SuperPtrTy, SelectorPtrTy}; return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(CGM.VoidTy, params, true), - "objc_msgSendSuper_stret"); + llvm::FunctionType::get(CGM.VoidTy, params, true), + "objc_msgSendSuper_stret"); } /// void objc_msgSendSuper2_stret(void * stretAddr, struct objc_super *super, @@ -150,10 +150,10 @@ class ObjCCommonTypesHelper { /// /// objc_msgSendSuper_stret with the super2 semantics. llvm::FunctionCallee getMessageSendSuperStretFn2() const { - llvm::Type *params[] = { Int8PtrTy, SuperPtrTy, SelectorPtrTy }; + llvm::Type *params[] = {Int8PtrTy, SuperPtrTy, SelectorPtrTy}; return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(CGM.VoidTy, params, true), - "objc_msgSendSuper2_stret"); + llvm::FunctionType::get(CGM.VoidTy, params, true), + "objc_msgSendSuper2_stret"); } llvm::FunctionCallee getMessageSendSuperFpretFn() const { @@ -240,9 +240,8 @@ class ObjCCommonTypesHelper { CanQualType Params[] = { IdType, SelType, Ctx.getPointerDiffType()->getCanonicalTypeUnqualified(), Ctx.BoolTy}; - llvm::FunctionType *FTy = - Types.GetFunctionType( - Types.arrangeBuiltinFunctionDeclaration(IdType, Params)); + llvm::FunctionType *FTy = Types.GetFunctionType( + Types.arrangeBuiltinFunctionDeclaration(IdType, Params)); return CGM.CreateRuntimeFunction(FTy, "objc_getProperty"); } @@ -259,9 +258,8 @@ class ObjCCommonTypesHelper { IdType, Ctx.BoolTy, Ctx.BoolTy}; - llvm::FunctionType *FTy = - Types.GetFunctionType( - Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); + llvm::FunctionType *FTy = Types.GetFunctionType( + Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); return CGM.CreateRuntimeFunction(FTy, "objc_setProperty"); } @@ -277,16 +275,15 @@ class ObjCCommonTypesHelper { // void objc_setProperty_nonatomic_copy(id self, SEL _cmd, // id newValue, ptrdiff_t offset); - SmallVector Params; + SmallVector Params; CanQualType IdType = Ctx.getCanonicalParamType(Ctx.getObjCIdType()); CanQualType SelType = Ctx.getCanonicalParamType(Ctx.getObjCSelType()); Params.push_back(IdType); Params.push_back(SelType); Params.push_back(IdType); Params.push_back(Ctx.getPointerDiffType()->getCanonicalTypeUnqualified()); - llvm::FunctionType *FTy = - Types.GetFunctionType( - Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); + llvm::FunctionType *FTy = Types.GetFunctionType( + Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); const char *name; if (atomic && copy) name = "objc_setProperty_atomic_copy"; @@ -304,15 +301,14 @@ class ObjCCommonTypesHelper { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); // void objc_copyStruct (void *, const void *, size_t, bool, bool) - SmallVector Params; + SmallVector Params; Params.push_back(Ctx.VoidPtrTy); 
Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.getSizeType()); Params.push_back(Ctx.BoolTy); Params.push_back(Ctx.BoolTy); - llvm::FunctionType *FTy = - Types.GetFunctionType( - Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); + llvm::FunctionType *FTy = Types.GetFunctionType( + Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); return CGM.CreateRuntimeFunction(FTy, "objc_copyStruct"); } @@ -323,14 +319,14 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getCppAtomicObjectFunction() { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); - /// void objc_copyCppObjectAtomic(void *dest, const void *src, void *helper); - SmallVector Params; + /// void objc_copyCppObjectAtomic(void *dest, const void *src, void + /// *helper); + SmallVector Params; Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.VoidPtrTy); - llvm::FunctionType *FTy = - Types.GetFunctionType( - Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); + llvm::FunctionType *FTy = Types.GetFunctionType( + Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); return CGM.CreateRuntimeFunction(FTy, "objc_copyCppObjectAtomic"); } @@ -338,11 +334,10 @@ class ObjCCommonTypesHelper { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); // void objc_enumerationMutation (id) - SmallVector Params; + SmallVector Params; Params.push_back(Ctx.getCanonicalParamType(Ctx.getObjCIdType())); - llvm::FunctionType *FTy = - Types.GetFunctionType( - Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); + llvm::FunctionType *FTy = Types.GetFunctionType( + Types.arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Params)); return CGM.CreateRuntimeFunction(FTy, "objc_enumerationMutation"); } @@ -350,13 +345,12 @@ class ObjCCommonTypesHelper { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); // Class objc_lookUpClass (const char *) - SmallVector Params; + SmallVector Params; Params.push_back( - Ctx.getCanonicalType(Ctx.getPointerType(Ctx.CharTy.withConst()))); + Ctx.getCanonicalType(Ctx.getPointerType(Ctx.CharTy.withConst()))); llvm::FunctionType *FTy = Types.GetFunctionType(Types.arrangeBuiltinFunctionDeclaration( - Ctx.getCanonicalType(Ctx.getObjCClassType()), - Params)); + Ctx.getCanonicalType(Ctx.getObjCClassType()), Params)); return CGM.CreateRuntimeFunction(FTy, "objc_lookUpClass"); } @@ -364,8 +358,7 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getGcReadWeakFn() { // id objc_read_weak (id *) llvm::Type *args[] = {CGM.UnqualPtrTy}; - llvm::FunctionType *FTy = - llvm::FunctionType::get(ObjectPtrTy, args, false); + llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_read_weak"); } @@ -373,8 +366,7 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getGcAssignWeakFn() { // id objc_assign_weak (id, id *) llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy}; - llvm::FunctionType *FTy = - llvm::FunctionType::get(ObjectPtrTy, args, false); + llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_assign_weak"); } @@ -382,8 +374,7 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getGcAssignGlobalFn() { // id objc_assign_global(id, id *) llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy}; - llvm::FunctionType *FTy = - llvm::FunctionType::get(ObjectPtrTy, args, false); + llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, 
args, false); return CGM.CreateRuntimeFunction(FTy, "objc_assign_global"); } @@ -391,8 +382,7 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getGcAssignThreadLocalFn() { // id objc_assign_threadlocal(id src, id * dest) llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy}; - llvm::FunctionType *FTy = - llvm::FunctionType::get(ObjectPtrTy, args, false); + llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_assign_threadlocal"); } @@ -400,15 +390,14 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getGcAssignIvarFn() { // id objc_assign_ivar(id, id *, ptrdiff_t) llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy, CGM.PtrDiffTy}; - llvm::FunctionType *FTy = - llvm::FunctionType::get(ObjectPtrTy, args, false); + llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_assign_ivar"); } /// GcMemmoveCollectableFn -- LLVM objc_memmove_collectable function. llvm::FunctionCallee GcMemmoveCollectableFn() { // void *objc_memmove_collectable(void *dst, const void *src, size_t size) - llvm::Type *args[] = { Int8PtrTy, Int8PtrTy, LongTy }; + llvm::Type *args[] = {Int8PtrTy, Int8PtrTy, LongTy}; llvm::FunctionType *FTy = llvm::FunctionType::get(Int8PtrTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_memmove_collectable"); } @@ -417,17 +406,15 @@ class ObjCCommonTypesHelper { llvm::FunctionCallee getGcAssignStrongCastFn() { // id objc_assign_strongCast(id, id *) llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy}; - llvm::FunctionType *FTy = - llvm::FunctionType::get(ObjectPtrTy, args, false); + llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_assign_strongCast"); } /// ExceptionThrowFn - LLVM objc_exception_throw function. llvm::FunctionCallee getExceptionThrowFn() { // void objc_exception_throw(id) - llvm::Type *args[] = { ObjectPtrTy }; - llvm::FunctionType *FTy = - llvm::FunctionType::get(CGM.VoidTy, args, false); + llvm::Type *args[] = {ObjectPtrTy}; + llvm::FunctionType *FTy = llvm::FunctionType::get(CGM.VoidTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_exception_throw"); } @@ -441,18 +428,16 @@ class ObjCCommonTypesHelper { /// SyncEnterFn - LLVM object_sync_enter function. llvm::FunctionCallee getSyncEnterFn() { // int objc_sync_enter (id) - llvm::Type *args[] = { ObjectPtrTy }; - llvm::FunctionType *FTy = - llvm::FunctionType::get(CGM.IntTy, args, false); + llvm::Type *args[] = {ObjectPtrTy}; + llvm::FunctionType *FTy = llvm::FunctionType::get(CGM.IntTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_sync_enter"); } /// SyncExitFn - LLVM object_sync_exit function. llvm::FunctionCallee getSyncExitFn() { // int objc_sync_exit (id) - llvm::Type *args[] = { ObjectPtrTy }; - llvm::FunctionType *FTy = - llvm::FunctionType::get(CGM.IntTy, args, false); + llvm::Type *args[] = {ObjectPtrTy}; + llvm::FunctionType *FTy = llvm::FunctionType::get(CGM.IntTy, args, false); return CGM.CreateRuntimeFunction(FTy, "objc_sync_exit"); } @@ -553,32 +538,32 @@ class ObjCTypesHelper : public ObjCCommonTypesHelper { llvm::FunctionCallee getExceptionTryEnterFn() { llvm::Type *params[] = {CGM.UnqualPtrTy}; return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(CGM.VoidTy, params, false), - "objc_exception_try_enter"); + llvm::FunctionType::get(CGM.VoidTy, params, false), + "objc_exception_try_enter"); } /// ExceptionTryExitFn - LLVM objc_exception_try_exit function. 
llvm::FunctionCallee getExceptionTryExitFn() { llvm::Type *params[] = {CGM.UnqualPtrTy}; return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(CGM.VoidTy, params, false), - "objc_exception_try_exit"); + llvm::FunctionType::get(CGM.VoidTy, params, false), + "objc_exception_try_exit"); } /// ExceptionExtractFn - LLVM objc_exception_extract function. llvm::FunctionCallee getExceptionExtractFn() { llvm::Type *params[] = {CGM.UnqualPtrTy}; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, false), - "objc_exception_extract"); + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, false), + "objc_exception_extract"); } /// ExceptionMatchFn - LLVM objc_exception_match function. llvm::FunctionCallee getExceptionMatchFn() { - llvm::Type *params[] = { ClassPtrTy, ObjectPtrTy }; + llvm::Type *params[] = {ClassPtrTy, ObjectPtrTy}; return CGM.CreateRuntimeFunction( - llvm::FunctionType::get(CGM.Int32Ty, params, false), - "objc_exception_match"); + llvm::FunctionType::get(CGM.Int32Ty, params, false), + "objc_exception_match"); } /// SetJmpFn - LLVM _setjmp function. @@ -670,44 +655,44 @@ class ObjCNonFragileABITypesHelper : public ObjCCommonTypesHelper { llvm::FunctionCallee getMessageSendFixupFn() { // id objc_msgSend_fixup(id, struct message_ref_t*, ...) - llvm::Type *params[] = { ObjectPtrTy, MessageRefPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSend_fixup"); + llvm::Type *params[] = {ObjectPtrTy, MessageRefPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSend_fixup"); } llvm::FunctionCallee getMessageSendFpretFixupFn() { // id objc_msgSend_fpret_fixup(id, struct message_ref_t*, ...) - llvm::Type *params[] = { ObjectPtrTy, MessageRefPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSend_fpret_fixup"); + llvm::Type *params[] = {ObjectPtrTy, MessageRefPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSend_fpret_fixup"); } llvm::FunctionCallee getMessageSendStretFixupFn() { // id objc_msgSend_stret_fixup(id, struct message_ref_t*, ...) - llvm::Type *params[] = { ObjectPtrTy, MessageRefPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSend_stret_fixup"); + llvm::Type *params[] = {ObjectPtrTy, MessageRefPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSend_stret_fixup"); } llvm::FunctionCallee getMessageSendSuper2FixupFn() { // id objc_msgSendSuper2_fixup (struct objc_super *, // struct _super_message_ref_t*, ...) - llvm::Type *params[] = { SuperPtrTy, SuperMessageRefPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSendSuper2_fixup"); + llvm::Type *params[] = {SuperPtrTy, SuperMessageRefPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSendSuper2_fixup"); } llvm::FunctionCallee getMessageSendSuper2StretFixupFn() { // id objc_msgSendSuper2_stret_fixup(struct objc_super *, // struct _super_message_ref_t*, ...) 
- llvm::Type *params[] = { SuperPtrTy, SuperMessageRefPtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(ObjectPtrTy, - params, true), - "objc_msgSendSuper2_stret_fixup"); + llvm::Type *params[] = {SuperPtrTy, SuperMessageRefPtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(ObjectPtrTy, params, true), + "objc_msgSendSuper2_stret_fixup"); } llvm::FunctionCallee getObjCEndCatchFn() { @@ -716,10 +701,9 @@ class ObjCNonFragileABITypesHelper : public ObjCCommonTypesHelper { } llvm::FunctionCallee getObjCBeginCatchFn() { - llvm::Type *params[] = { Int8PtrTy }; - return CGM.CreateRuntimeFunction(llvm::FunctionType::get(Int8PtrTy, - params, false), - "objc_begin_catch"); + llvm::Type *params[] = {Int8PtrTy}; + return CGM.CreateRuntimeFunction( + llvm::FunctionType::get(Int8PtrTy, params, false), "objc_begin_catch"); } /// Class objc_loadClassref (void *) @@ -733,21 +717,23 @@ class ObjCNonFragileABITypesHelper : public ObjCCommonTypesHelper { // // Also it is safe to make it readnone, since we never load or store the // classref except by calling this function. - llvm::Type *params[] = { Int8PtrPtrTy }; + llvm::Type *params[] = {Int8PtrPtrTy}; llvm::LLVMContext &C = CGM.getLLVMContext(); - llvm::AttributeSet AS = llvm::AttributeSet::get(C, { - llvm::Attribute::get(C, llvm::Attribute::NonLazyBind), - llvm::Attribute::getWithMemoryEffects(C, llvm::MemoryEffects::none()), - llvm::Attribute::get(C, llvm::Attribute::NoUnwind), - }); + llvm::AttributeSet AS = llvm::AttributeSet::get( + C, { + llvm::Attribute::get(C, llvm::Attribute::NonLazyBind), + llvm::Attribute::getWithMemoryEffects( + C, llvm::MemoryEffects::none()), + llvm::Attribute::get(C, llvm::Attribute::NoUnwind), + }); llvm::FunctionCallee F = CGM.CreateRuntimeFunction( llvm::FunctionType::get(ClassnfABIPtrTy, params, false), "objc_loadClassref", llvm::AttributeList::get(CGM.getLLVMContext(), llvm::AttributeList::FunctionIndex, AS)); if (!CGM.getTriple().isOSBinFormatCOFF()) - cast(F.getCallee())->setLinkage( - llvm::Function::ExternalWeakLinkage); + cast(F.getCallee()) + ->setLinkage(llvm::Function::ExternalWeakLinkage); return F; } @@ -772,9 +758,10 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { unsigned skip; unsigned scan; SKIP_SCAN(unsigned _skip = 0, unsigned _scan = 0) - : skip(_skip), scan(_scan) {} + : skip(_skip), scan(_scan) {} }; + // clang-format off /// opcode for captured block variables layout 'instructions'. /// In the following descriptions, 'I' is the value of the immediate field. /// (field following the opcode). @@ -821,11 +808,12 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { /// /// This is included so that older tools can at least continue /// processing the layout past such things. - //BLOCK_LAYOUT_OWNERSHIP_UNKNOWN = 7..10, + // BLOCK_LAYOUT_OWNERSHIP_UNKNOWN = 7..10, /// All other opcodes are reserved. Halt interpretation and /// treat everything else as opaque. }; + // clang-format on class RUN_SKIP { public: @@ -835,7 +823,7 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { RUN_SKIP(enum BLOCK_LAYOUT_OPCODE Opcode = BLOCK_LAYOUT_OPERATOR, CharUnits BytePos = CharUnits::Zero(), CharUnits Size = CharUnits::Zero()) - : opcode(Opcode), block_var_bytepos(BytePos), block_var_size(Size) {} + : opcode(Opcode), block_var_bytepos(BytePos), block_var_size(Size) {} // Allow sorting based on byte pos. 
bool operator<(const RUN_SKIP &b) const { @@ -853,70 +841,71 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { /// LazySymbols - Symbols to generate a lazy reference for. See /// DefinedSymbols and FinishModule(). - llvm::SetVector LazySymbols; + llvm::SetVector LazySymbols; /// DefinedSymbols - External symbols which are defined by this /// module. The symbols in this list and LazySymbols are used to add /// special linker symbols which ensure that Objective-C modules are /// linked properly. - llvm::SetVector DefinedSymbols; + llvm::SetVector DefinedSymbols; /// ClassNames - uniqued class names. - llvm::StringMap ClassNames; + llvm::StringMap ClassNames; /// MethodVarNames - uniqued method variable names. - llvm::DenseMap MethodVarNames; + llvm::DenseMap MethodVarNames; /// DefinedCategoryNames - list of category names in form Class_Category. llvm::SmallSetVector DefinedCategoryNames; /// MethodVarTypes - uniqued method type signatures. We have to use /// a StringMap here because have no other unique reference. - llvm::StringMap MethodVarTypes; + llvm::StringMap MethodVarTypes; /// MethodDefinitions - map of methods which have been defined in /// this translation unit. - llvm::DenseMap MethodDefinitions; + llvm::DenseMap MethodDefinitions; /// DirectMethodDefinitions - map of direct methods which have been defined in /// this translation unit. - llvm::DenseMap DirectMethodDefinitions; + llvm::DenseMap + DirectMethodDefinitions; /// PropertyNames - uniqued method variable names. - llvm::DenseMap PropertyNames; + llvm::DenseMap PropertyNames; /// ClassReferences - uniqued class references. - llvm::DenseMap ClassReferences; + llvm::DenseMap ClassReferences; /// SelectorReferences - uniqued selector references. - llvm::DenseMap SelectorReferences; + llvm::DenseMap SelectorReferences; /// Protocols - Protocols for which an objc_protocol structure has /// been emitted. Forward declarations are handled by creating an /// empty structure whose initializer is filled in when/if defined. - llvm::DenseMap Protocols; + llvm::DenseMap Protocols; /// DefinedProtocols - Protocols which have actually been /// defined. We should not need this, see FIXME in GenerateProtocol. - llvm::DenseSet DefinedProtocols; + llvm::DenseSet DefinedProtocols; /// DefinedClasses - List of defined classes. - SmallVector DefinedClasses; + SmallVector DefinedClasses; /// ImplementedClasses - List of @implemented classes. - SmallVector ImplementedClasses; + SmallVector ImplementedClasses; /// DefinedNonLazyClasses - List of defined "non-lazy" classes. - SmallVector DefinedNonLazyClasses; + SmallVector DefinedNonLazyClasses; /// DefinedCategories - List of defined categories. - SmallVector DefinedCategories; + SmallVector DefinedCategories; /// DefinedStubCategories - List of defined categories on class stubs. - SmallVector DefinedStubCategories; + SmallVector DefinedStubCategories; /// DefinedNonLazyCategories - List of defined "non-lazy" categories. - SmallVector DefinedNonLazyCategories; + SmallVector DefinedNonLazyCategories; /// Cached reference to the class for constant strings. This value has type /// int * but is actually an Obj-C class pointer. @@ -963,10 +952,8 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { /// building a weak layout. Does not guarantee that the layout will /// actually have any entries, because the ivar might be under-aligned. 
llvm::Constant *BuildIvarLayout(const ObjCImplementationDecl *OI, - CharUnits beginOffset, - CharUnits endOffset, - bool forStrongLayout, - bool hasMRCWeakIvars); + CharUnits beginOffset, CharUnits endOffset, + bool forStrongLayout, bool hasMRCWeakIvars); llvm::Constant *BuildStrongIvarLayout(const ObjCImplementationDecl *OI, CharUnits beginOffset, @@ -981,22 +968,19 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { return BuildIvarLayout(OI, beginOffset, endOffset, false, hasMRCWeakIvars); } - Qualifiers::ObjCLifetime getBlockCaptureLifetime(QualType QT, bool ByrefLayout); + Qualifiers::ObjCLifetime getBlockCaptureLifetime(QualType QT, + bool ByrefLayout); - void UpdateRunSkipBlockVars(bool IsByref, - Qualifiers::ObjCLifetime LifeTime, - CharUnits FieldOffset, - CharUnits FieldSize); + void UpdateRunSkipBlockVars(bool IsByref, Qualifiers::ObjCLifetime LifeTime, + CharUnits FieldOffset, CharUnits FieldSize); - void BuildRCBlockVarRecordLayout(const RecordType *RT, - CharUnits BytePos, bool &HasUnion, - bool ByrefLayout=false); + void BuildRCBlockVarRecordLayout(const RecordType *RT, CharUnits BytePos, + bool &HasUnion, bool ByrefLayout = false); void BuildRCRecordLayout(const llvm::StructLayout *RecLayout, const RecordDecl *RD, - ArrayRef RecFields, - CharUnits BytePos, bool &HasUnion, - bool ByrefLayout); + ArrayRef RecFields, + CharUnits BytePos, bool &HasUnion, bool ByrefLayout); uint64_t InlineLayoutInstruction(SmallVectorImpl &Layout); @@ -1009,17 +993,16 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { /// EmitPropertyList - Emit the given property list. The return /// value has type PropertyListPtrTy. - llvm::Constant *EmitPropertyList(Twine Name, - const Decl *Container, + llvm::Constant *EmitPropertyList(Twine Name, const Decl *Container, const ObjCContainerDecl *OCD, const ObjCCommonTypesHelper &ObjCTypes, bool IsClassProperty); /// EmitProtocolMethodTypes - Generate the array of extended method type /// strings. The return value has type Int8PtrPtrTy. 
- llvm::Constant *EmitProtocolMethodTypes(Twine Name, - ArrayRef MethodTypes, - const ObjCCommonTypesHelper &ObjCTypes); + llvm::Constant * + EmitProtocolMethodTypes(Twine Name, ArrayRef MethodTypes, + const ObjCCommonTypesHelper &ObjCTypes); /// GetProtocolRef - Return a reference to the internal protocol /// description, creating an empty one if it has not been @@ -1053,8 +1036,7 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { ConstantStructBuilder &Init, StringRef Section, CharUnits Align, bool AddToUsed); - llvm::GlobalVariable *CreateMetadataVar(Twine Name, - llvm::Constant *Init, + llvm::GlobalVariable *CreateMetadataVar(Twine Name, llvm::Constant *Init, StringRef Section, CharUnits Align, bool AddToUsed); @@ -1065,12 +1047,9 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { protected: CodeGen::RValue EmitMessageSend(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - llvm::Value *Arg0, - QualType Arg0Ty, - bool IsSuper, + ReturnValueSlot Return, QualType ResultType, + Selector Sel, llvm::Value *Arg0, + QualType Arg0Ty, bool IsSuper, const CallArgList &CallArgs, const ObjCMethodDecl *OMD, const ObjCInterfaceDecl *ClassReceiver, @@ -1084,15 +1063,14 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { CGObjCCommonMac(CodeGen::CodeGenModule &cgm) : CGObjCRuntime(cgm), VMContext(cgm.getLLVMContext()) {} - bool isNonFragileABI() const { - return ObjCABI == 2; - } + bool isNonFragileABI() const { return ObjCABI == 2; } ConstantAddress GenerateConstantString(const StringLiteral *SL) override; ConstantAddress GenerateConstantNSString(const StringLiteral *SL); - llvm::Function *GenerateMethod(const ObjCMethodDecl *OMD, - const ObjCContainerDecl *CD=nullptr) override; + llvm::Function * + GenerateMethod(const ObjCMethodDecl *OMD, + const ObjCContainerDecl *CD = nullptr) override; llvm::Function *GenerateDirectMethod(const ObjCMethodDecl *OMD, const ObjCContainerDecl *CD); @@ -1107,7 +1085,7 @@ class CGObjCCommonMac : public CodeGen::CGObjCRuntime { /// object for the given declaration, emitting it if needed. These /// forward references will be filled in with empty bodies if no /// definition is seen. The return value has type ProtocolPtrTy. - virtual llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD)=0; + virtual llvm::Constant *GetOrEmitProtocolRef(const ObjCProtocolDecl *PD) = 0; virtual llvm::Constant *getNSConstantStringClassRef() = 0; @@ -1148,9 +1126,7 @@ class ProtocolMethodLists { OptionalInstanceMethods, OptionalClassMethods }; - enum { - NumProtocolMethodLists = 4 - }; + enum { NumProtocolMethodLists = 4 }; static MethodListType getMethodListKind(Kind kind) { switch (kind) { @@ -1172,8 +1148,8 @@ class ProtocolMethodLists { ProtocolMethodLists result; for (auto *MD : PD->methods()) { - size_t index = (2 * size_t(MD->isOptional())) - + (size_t(MD->isClassMethod())); + size_t index = + (2 * size_t(MD->isOptional())) + (size_t(MD->isClassMethod())); result.Methods[index].push_back(MD); } @@ -1181,14 +1157,14 @@ class ProtocolMethodLists { } template - SmallVector emitExtendedTypesArray(Self *self) const { + SmallVector emitExtendedTypesArray(Self *self) const { // In both ABIs, the method types list is parallel with the // concatenation of the methods arrays in the following order: // instance methods // class methods // optional instance methods // optional class methods - SmallVector result; + SmallVector result; // Methods is already in the correct order for both ABIs. 
for (auto &list : Methods) { @@ -1233,16 +1209,13 @@ class CGObjCMac : public CGObjCCommonMac { /// has type ClassExtensionPtrTy. llvm::Constant *EmitClassExtension(const ObjCImplementationDecl *ID, CharUnits instanceSize, - bool hasMRCWeakIvars, - bool isMetaclass); + bool hasMRCWeakIvars, bool isMetaclass); /// EmitClassRef - Return a Value*, of type ObjCTypes.ClassPtrTy, /// for the given class. - llvm::Value *EmitClassRef(CodeGenFunction &CGF, - const ObjCInterfaceDecl *ID); + llvm::Value *EmitClassRef(CodeGenFunction &CGF, const ObjCInterfaceDecl *ID); - llvm::Value *EmitClassRefFromId(CodeGenFunction &CGF, - IdentifierInfo *II); + llvm::Value *EmitClassRefFromId(CodeGenFunction &CGF, IdentifierInfo *II); llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) override; @@ -1254,8 +1227,7 @@ class CGObjCMac : public CGObjCCommonMac { /// (i.e. metaclass ivars) is emitted, otherwise the list of /// interface ivars will be emitted. The return value has type /// IvarListPtrTy. - llvm::Constant *EmitIvarList(const ObjCImplementationDecl *ID, - bool ForClass); + llvm::Constant *EmitIvarList(const ObjCImplementationDecl *ID, bool ForClass); /// EmitMetaClass - Emit a forward reference to the class structure /// for the metaclass of the given interface. The return value has @@ -1294,9 +1266,8 @@ class CGObjCMac : public CGObjCCommonMac { /// structure used to store optional instance and class methods, and /// protocol properties. The return value has type /// ProtocolExtensionPtrTy. - llvm::Constant * - EmitProtocolExtension(const ObjCProtocolDecl *PD, - const ProtocolMethodLists &methodLists); + llvm::Constant *EmitProtocolExtension(const ObjCProtocolDecl *PD, + const ProtocolMethodLists &methodLists); /// EmitProtocolList - Generate the list of referenced /// protocols. The return value has type ProtocolListPtrTy. 
@@ -1318,19 +1289,17 @@ class CGObjCMac : public CGObjCCommonMac { CodeGen::RValue GenerateMessageSend(CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, - QualType ResultType, - Selector Sel, llvm::Value *Receiver, + QualType ResultType, Selector Sel, + llvm::Value *Receiver, const CallArgList &CallArgs, const ObjCInterfaceDecl *Class, const ObjCMethodDecl *Method) override; - CodeGen::RValue - GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, QualType ResultType, - Selector Sel, const ObjCInterfaceDecl *Class, - bool isCategoryImpl, llvm::Value *Receiver, - bool IsClassMessage, const CallArgList &CallArgs, - const ObjCMethodDecl *Method) override; + CodeGen::RValue GenerateMessageSendSuper( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, + QualType ResultType, Selector Sel, const ObjCInterfaceDecl *Class, + bool isCategoryImpl, llvm::Value *Receiver, bool IsClassMessage, + const CallArgList &CallArgs, const ObjCMethodDecl *Method) override; llvm::Value *GetClass(CodeGenFunction &CGF, const ObjCInterfaceDecl *ID) override; @@ -1370,22 +1339,19 @@ class CGObjCMac : public CGObjCCommonMac { const ObjCAtSynchronizedStmt &S) override; void EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, const Stmt &S); void EmitThrowStmt(CodeGen::CodeGenFunction &CGF, const ObjCAtThrowStmt &S, - bool ClearInsertionPoint=true) override; - llvm::Value * EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, - Address AddrWeakObj) override; - void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dst) override; - void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dest, - bool threadlocal = false) override; - void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dest, - llvm::Value *ivarOffset) override; - void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dest) override; - void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF, - Address dest, Address src, - llvm::Value *size) override; + bool ClearInsertionPoint = true) override; + llvm::Value *EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, + Address AddrWeakObj) override; + void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dst) override; + void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dest, bool threadlocal = false) override; + void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dest, llvm::Value *ivarOffset) override; + void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dest) override; + void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF, Address dest, + Address src, llvm::Value *size) override; LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF, QualType ObjectTy, llvm::Value *BaseValue, const ObjCIvarDecl *Ivar, @@ -1399,24 +1365,24 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { private: friend ProtocolMethodLists; ObjCNonFragileABITypesHelper ObjCTypes; - llvm::GlobalVariable* ObjCEmptyCacheVar; - llvm::Constant* ObjCEmptyVtableVar; + llvm::GlobalVariable *ObjCEmptyCacheVar; + llvm::Constant *ObjCEmptyVtableVar; /// SuperClassReferences - uniqued super class references. - llvm::DenseMap SuperClassReferences; + llvm::DenseMap SuperClassReferences; /// MetaClassReferences - uniqued meta class references. 
- llvm::DenseMap MetaClassReferences; + llvm::DenseMap MetaClassReferences; /// EHTypeReferences - uniqued class ehtype references. - llvm::DenseMap EHTypeReferences; + llvm::DenseMap EHTypeReferences; /// VTableDispatchMethods - List of methods for which we generate /// vtable-based message dispatch. llvm::DenseSet VTableDispatchMethods; /// DefinedMetaClasses - List of defined meta-classes. - std::vector DefinedMetaClasses; + std::vector DefinedMetaClasses; /// isVTableDispatchedSelector - Returns true if SEL is a /// vtable-based selector. @@ -1431,20 +1397,17 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { void AddModuleClassList(ArrayRef Container, StringRef SymbolName, StringRef SectionName); - llvm::GlobalVariable * BuildClassRoTInitializer(unsigned flags, - unsigned InstanceStart, - unsigned InstanceSize, - const ObjCImplementationDecl *ID); - llvm::GlobalVariable *BuildClassObject(const ObjCInterfaceDecl *CI, - bool isMetaclass, - llvm::Constant *IsAGV, - llvm::Constant *SuperClassGV, - llvm::Constant *ClassRoGV, - bool HiddenVisibility); + llvm::GlobalVariable * + BuildClassRoTInitializer(unsigned flags, unsigned InstanceStart, + unsigned InstanceSize, + const ObjCImplementationDecl *ID); + llvm::GlobalVariable * + BuildClassObject(const ObjCInterfaceDecl *CI, bool isMetaclass, + llvm::Constant *IsAGV, llvm::Constant *SuperClassGV, + llvm::Constant *ClassRoGV, bool HiddenVisibility); void emitMethodConstant(ConstantArrayBuilder &builder, - const ObjCMethodDecl *MD, - bool forProtocol); + const ObjCMethodDecl *MD, bool forProtocol); /// Emit the method list for the given implementation. The return value /// has type MethodListnfABITy. @@ -1479,23 +1442,17 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { ObjCProtocolDecl::protocol_iterator begin, ObjCProtocolDecl::protocol_iterator end); - CodeGen::RValue EmitVTableMessageSend(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - llvm::Value *Receiver, - QualType Arg0Ty, - bool IsSuper, - const CallArgList &CallArgs, - const ObjCMethodDecl *Method); + CodeGen::RValue EmitVTableMessageSend( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, + QualType ResultType, Selector Sel, llvm::Value *Receiver, QualType Arg0Ty, + bool IsSuper, const CallArgList &CallArgs, const ObjCMethodDecl *Method); /// GetClassGlobal - Return the global variable for the Objective-C /// class of the given name. llvm::Constant *GetClassGlobal(StringRef Name, ForDefinition_t IsForDefinition, bool Weak = false, bool DLLImport = false); - llvm::Constant *GetClassGlobal(const ObjCInterfaceDecl *ID, - bool isMetaclass, + llvm::Constant *GetClassGlobal(const ObjCInterfaceDecl *ID, bool isMetaclass, ForDefinition_t isForDefinition); llvm::Constant *GetClassGlobalForClassRef(const ObjCInterfaceDecl *ID); @@ -1506,11 +1463,9 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { /// EmitClassRef - Return a Value*, of type ObjCTypes.ClassPtrTy, /// for the given class reference. 
- llvm::Value *EmitClassRef(CodeGenFunction &CGF, - const ObjCInterfaceDecl *ID); + llvm::Value *EmitClassRef(CodeGenFunction &CGF, const ObjCInterfaceDecl *ID); - llvm::Value *EmitClassRefFromId(CodeGenFunction &CGF, - IdentifierInfo *II, + llvm::Value *EmitClassRefFromId(CodeGenFunction &CGF, IdentifierInfo *II, const ObjCInterfaceDecl *ID); llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) override; @@ -1528,9 +1483,8 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { /// ObjCIvarOffsetVariable - Returns the ivar offset variable for /// the given ivar. /// - llvm::GlobalVariable * ObjCIvarOffsetVariable( - const ObjCInterfaceDecl *ID, - const ObjCIvarDecl *Ivar); + llvm::GlobalVariable *ObjCIvarOffsetVariable(const ObjCInterfaceDecl *ID, + const ObjCIvarDecl *Ivar); /// EmitSelector - Return a Value*, of type ObjCTypes.SelectorPtrTy, /// for the given selector. @@ -1547,16 +1501,15 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { StringRef getClassSymbolPrefix() const { return "OBJC_CLASS_$_"; } void GetClassSizeInfo(const ObjCImplementationDecl *OID, - uint32_t &InstanceStart, - uint32_t &InstanceSize); + uint32_t &InstanceStart, uint32_t &InstanceSize); // Shamelessly stolen from Analysis/CFRefCount.cpp - Selector GetNullarySelector(const char* name) const { + Selector GetNullarySelector(const char *name) const { const IdentifierInfo *II = &CGM.getContext().Idents.get(name); return CGM.getContext().Selectors.getSelector(0, &II); } - Selector GetUnarySelector(const char* name) const { + Selector GetUnarySelector(const char *name) const { const IdentifierInfo *II = &CGM.getContext().Idents.get(name); return CGM.getContext().Selectors.getSelector(1, &II); } @@ -1582,7 +1535,7 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { // and that the method may be inlined, this optimization actually // can't be performed. if (const ObjCMethodDecl *MD = - dyn_cast_or_null(CGF.CurFuncDecl)) + dyn_cast_or_null(CGF.CurFuncDecl)) if (MD->isInstanceMethod() && !MD->isDirectMethod()) if (const ObjCInterfaceDecl *ID = MD->getClassInterface()) return IV->getContainingInterface()->isSuperClassOf(ID); @@ -1621,27 +1574,28 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { const ObjCInterfaceDecl *Class, const ObjCMethodDecl *Method) override; - CodeGen::RValue - GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, QualType ResultType, - Selector Sel, const ObjCInterfaceDecl *Class, - bool isCategoryImpl, llvm::Value *Receiver, - bool IsClassMessage, const CallArgList &CallArgs, - const ObjCMethodDecl *Method) override; + CodeGen::RValue GenerateMessageSendSuper( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, + QualType ResultType, Selector Sel, const ObjCInterfaceDecl *Class, + bool isCategoryImpl, llvm::Value *Receiver, bool IsClassMessage, + const CallArgList &CallArgs, const ObjCMethodDecl *Method) override; llvm::Value *GetClass(CodeGenFunction &CGF, const ObjCInterfaceDecl *ID) override; - llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel) override - { return EmitSelector(CGF, Sel); } - Address GetAddrOfSelector(CodeGenFunction &CGF, Selector Sel) override - { return EmitSelectorAddr(Sel); } + llvm::Value *GetSelector(CodeGenFunction &CGF, Selector Sel) override { + return EmitSelector(CGF, Sel); + } + Address GetAddrOfSelector(CodeGenFunction &CGF, Selector Sel) override { + return EmitSelectorAddr(Sel); + } /// The NeXT/Apple runtimes do not support typed selectors; just emit an /// untyped one. 
llvm::Value *GetSelector(CodeGenFunction &CGF, - const ObjCMethodDecl *Method) override - { return EmitSelector(CGF, Method->getSelector()); } + const ObjCMethodDecl *Method) override { + return EmitSelector(CGF, Method->getSelector()); + } void GenerateCategory(const ObjCCategoryImplDecl *CMD) override; @@ -1691,22 +1645,19 @@ class CGObjCNonFragileABIMac : public CGObjCCommonMac { void EmitSynchronizedStmt(CodeGen::CodeGenFunction &CGF, const ObjCAtSynchronizedStmt &S) override; void EmitThrowStmt(CodeGen::CodeGenFunction &CGF, const ObjCAtThrowStmt &S, - bool ClearInsertionPoint=true) override; - llvm::Value * EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, - Address AddrWeakObj) override; - void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address edst) override; - void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dest, - bool threadlocal = false) override; - void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dest, - llvm::Value *ivarOffset) override; - void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dest) override; - void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF, - Address dest, Address src, - llvm::Value *size) override; + bool ClearInsertionPoint = true) override; + llvm::Value *EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, + Address AddrWeakObj) override; + void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address edst) override; + void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dest, bool threadlocal = false) override; + void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dest, llvm::Value *ivarOffset) override; + void EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, + Address dest) override; + void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF, Address dest, + Address src, llvm::Value *size) override; LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF, QualType ObjectTy, llvm::Value *BaseValue, const ObjCIvarDecl *Ivar, unsigned CVRQualifiers) override; @@ -1740,14 +1691,12 @@ struct NullReturnState { /// Complete the null-return operation. It is valid to call this /// regardless of whether 'init' has been called. - RValue complete(CodeGenFunction &CGF, - ReturnValueSlot returnSlot, - RValue result, - QualType resultType, - const CallArgList &CallArgs, - const ObjCMethodDecl *Method) { + RValue complete(CodeGenFunction &CGF, ReturnValueSlot returnSlot, + RValue result, QualType resultType, + const CallArgList &CallArgs, const ObjCMethodDecl *Method) { // If we never had to do a null-check, just use the raw result. - if (!NullBB) return result; + if (!NullBB) + return result; // The continuation block. This will be left null if we don't have an // IP, which can happen if the method we're calling is marked noreturn. @@ -1774,7 +1723,8 @@ struct NullReturnState { // If we've got a void return, just jump to the continuation block. if (result.isScalar() && resultType->isVoidType()) { // No jumps required if the message-send was noreturn. - if (contBB) CGF.EmitBlock(contBB); + if (contBB) + CGF.EmitBlock(contBB); return result; } @@ -1785,7 +1735,8 @@ struct NullReturnState { CGF.EmitFromMemory(CGF.CGM.EmitNullConstant(resultType), resultType); // If no join is necessary, just flow out. - if (!contBB) return RValue::get(null); + if (!contBB) + return RValue::get(null); // Otherwise, build a phi. 
CGF.EmitBlock(contBB); @@ -1803,7 +1754,8 @@ struct NullReturnState { assert(result.isAggregate() && "null init of non-aggregate result?"); if (!returnSlot.isUnused()) CGF.EmitNullInitialization(result.getAggregateAddress(), resultType); - if (contBB) CGF.EmitBlock(contBB); + if (contBB) + CGF.EmitBlock(contBB); return result; } @@ -1835,9 +1787,8 @@ static llvm::Constant *getConstantGEP(llvm::LLVMContext &VMContext, llvm::GlobalVariable *C, unsigned idx0, unsigned idx1) { llvm::Value *Idxs[] = { - llvm::ConstantInt::get(llvm::Type::getInt32Ty(VMContext), idx0), - llvm::ConstantInt::get(llvm::Type::getInt32Ty(VMContext), idx1) - }; + llvm::ConstantInt::get(llvm::Type::getInt32Ty(VMContext), idx0), + llvm::ConstantInt::get(llvm::Type::getInt32Ty(VMContext), idx1)}; return llvm::ConstantExpr::getGetElementPtr(C->getValueType(), C, Idxs); } @@ -1863,7 +1814,7 @@ getLinkageTypeForObjCMetadata(CodeGenModule &CGM, StringRef Section) { /// A helper function to create an internal or private global variable. static llvm::GlobalVariable * finishAndCreateGlobal(ConstantInitBuilder::StructBuilder &Builder, - const llvm::Twine &Name, CodeGenModule &CGM) { + const llvm::Twine &Name, CodeGenModule &CGM) { std::string SectionName; if (CGM.getTriple().isOSBinFormatMachO()) SectionName = "__DATA, __objc_const"; @@ -1876,8 +1827,8 @@ finishAndCreateGlobal(ConstantInitBuilder::StructBuilder &Builder, /* *** CGObjCMac Public Interface *** */ -CGObjCMac::CGObjCMac(CodeGen::CodeGenModule &cgm) : CGObjCCommonMac(cgm), - ObjCTypes(cgm) { +CGObjCMac::CGObjCMac(CodeGen::CodeGenModule &cgm) + : CGObjCCommonMac(cgm), ObjCTypes(cgm) { ObjCABI = 1; EmitImageInfo(); } @@ -1896,24 +1847,22 @@ llvm::Value *CGObjCMac::GetSelector(CodeGenFunction &CGF, Selector Sel) { Address CGObjCMac::GetAddrOfSelector(CodeGenFunction &CGF, Selector Sel) { return EmitSelectorAddr(Sel); } -llvm::Value *CGObjCMac::GetSelector(CodeGenFunction &CGF, const ObjCMethodDecl - *Method) { +llvm::Value *CGObjCMac::GetSelector(CodeGenFunction &CGF, + const ObjCMethodDecl *Method) { return EmitSelector(CGF, Method->getSelector()); } llvm::Constant *CGObjCMac::GetEHType(QualType T) { - if (T->isObjCIdType() || - T->isObjCQualifiedIdType()) { + if (T->isObjCIdType() || T->isObjCQualifiedIdType()) { return CGM.GetAddrOfRTTIDescriptor( - CGM.getContext().getObjCIdRedefinitionType(), /*ForEH=*/true); + CGM.getContext().getObjCIdRedefinitionType(), /*ForEH=*/true); } - if (T->isObjCClassType() || - T->isObjCQualifiedClassType()) { + if (T->isObjCClassType() || T->isObjCQualifiedClassType()) { return CGM.GetAddrOfRTTIDescriptor( - CGM.getContext().getObjCClassRedefinitionType(), /*ForEH=*/true); + CGM.getContext().getObjCClassRedefinitionType(), /*ForEH=*/true); } if (T->isObjCObjectPointerType()) - return CGM.GetAddrOfRTTIDescriptor(T, /*ForEH=*/true); + return CGM.GetAddrOfRTTIDescriptor(T, /*ForEH=*/true); llvm_unreachable("asking for catch type for ObjC type in fragile runtime"); } @@ -1940,8 +1889,8 @@ llvm::Constant *CGObjCMac::GetEHType(QualType T) { ConstantAddress CGObjCCommonMac::GenerateConstantString(const StringLiteral *SL) { return (!CGM.getLangOpts().NoConstantCFStrings - ? CGM.GetAddrOfConstantCFString(SL) - : GenerateConstantNSString(SL)); + ? CGM.GetAddrOfConstantCFString(SL) + : GenerateConstantNSString(SL)); } static llvm::StringMapEntry & @@ -1957,9 +1906,8 @@ llvm::Constant *CGObjCMac::getNSConstantStringClassRef() { return cast(V); auto &StringClass = CGM.getLangOpts().ObjCConstantStringClass; - std::string str = - StringClass.empty() ? 
"_NSConstantStringClassReference" - : "_" + StringClass + "ClassReference"; + std::string str = StringClass.empty() ? "_NSConstantStringClassReference" + : "_" + StringClass + "ClassReference"; llvm::Type *PTy = llvm::ArrayType::get(CGM.IntTy, 0); auto GV = CGM.CreateRuntimeVariable(PTy, str); @@ -1972,9 +1920,8 @@ llvm::Constant *CGObjCNonFragileABIMac::getNSConstantStringClassRef() { return cast(V); auto &StringClass = CGM.getLangOpts().ObjCConstantStringClass; - std::string str = - StringClass.empty() ? "OBJC_CLASS_$_NSConstantString" - : "OBJC_CLASS_$_" + StringClass; + std::string str = StringClass.empty() ? "OBJC_CLASS_$_NSConstantString" + : "OBJC_CLASS_$_" + StringClass; llvm::Constant *GV = GetClassGlobal(str, NotForDefinition); ConstantStringClassRef = GV; return GV; @@ -1984,11 +1931,11 @@ ConstantAddress CGObjCCommonMac::GenerateConstantNSString(const StringLiteral *Literal) { unsigned StringLength = 0; llvm::StringMapEntry &Entry = - GetConstantStringEntry(NSConstantStringMap, Literal, StringLength); + GetConstantStringEntry(NSConstantStringMap, Literal, StringLength); if (auto *C = Entry.second) - return ConstantAddress( - C, C->getValueType(), CharUnits::fromQuantity(C->getAlignment())); + return ConstantAddress(C, C->getValueType(), + CharUnits::fromQuantity(C->getAlignment())); // If we don't already have it, get _NSConstantStringClassReference. llvm::Constant *Class = getNSConstantStringClassRef(); @@ -2008,7 +1955,7 @@ CGObjCCommonMac::GenerateConstantNSString(const StringLiteral *Literal) { // String pointer. llvm::Constant *C = - llvm::ConstantDataArray::getString(VMContext, Entry.first()); + llvm::ConstantDataArray::getString(VMContext, Entry.first()); llvm::GlobalValue::LinkageTypes Linkage = llvm::GlobalValue::PrivateLinkage; bool isConstant = !CGM.getLangOpts().WritableStrings; @@ -2041,30 +1988,22 @@ CGObjCCommonMac::GenerateConstantNSString(const StringLiteral *Literal) { return ConstantAddress(GV, GV->getValueType(), Alignment); } -enum { - kCFTaggedObjectID_Integer = (1 << 1) + 1 -}; +enum { kCFTaggedObjectID_Integer = (1 << 1) + 1 }; /// Generates a message send where the super is the receiver. This is /// a message send to self with special delivery semantics indicating /// which class's method should be called. -CodeGen::RValue -CGObjCMac::GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - const ObjCInterfaceDecl *Class, - bool isCategoryImpl, - llvm::Value *Receiver, - bool IsClassMessage, - const CodeGen::CallArgList &CallArgs, - const ObjCMethodDecl *Method) { +CodeGen::RValue CGObjCMac::GenerateMessageSendSuper( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, QualType ResultType, + Selector Sel, const ObjCInterfaceDecl *Class, bool isCategoryImpl, + llvm::Value *Receiver, bool IsClassMessage, + const CodeGen::CallArgList &CallArgs, const ObjCMethodDecl *Method) { // Create and init a super structure; this is a (receiver, class) // pair we will pass to objc_msgSendSuper. 
RawAddress ObjCSuper = CGF.CreateTempAlloca( ObjCTypes.SuperTy, CGF.getPointerAlign(), "objc_super"); llvm::Value *ReceiverAsObject = - CGF.Builder.CreateBitCast(Receiver, ObjCTypes.ObjectPtrTy); + CGF.Builder.CreateBitCast(Receiver, ObjCTypes.ObjectPtrTy); CGF.Builder.CreateStore(ReceiverAsObject, CGF.Builder.CreateStructGEP(ObjCSuper, 0)); @@ -2102,7 +2041,7 @@ CGObjCMac::GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, // FIXME: We shouldn't need to do this cast, rectify the ASTContext and // ObjCTypes types. llvm::Type *ClassTy = - CGM.getTypes().ConvertType(CGF.getContext().getObjCClassType()); + CGM.getTypes().ConvertType(CGF.getContext().getObjCClassType()); Target = CGF.Builder.CreateBitCast(Target, ClassTy); CGF.Builder.CreateStore(Target, CGF.Builder.CreateStructGEP(ObjCSuper, 1)); return EmitMessageSend(CGF, Return, ResultType, Sel, ObjCSuper.getPointer(), @@ -2111,31 +2050,21 @@ CGObjCMac::GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, } /// Generate code for a message send expression. -CodeGen::RValue CGObjCMac::GenerateMessageSend(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - llvm::Value *Receiver, - const CallArgList &CallArgs, - const ObjCInterfaceDecl *Class, - const ObjCMethodDecl *Method) { +CodeGen::RValue CGObjCMac::GenerateMessageSend( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, QualType ResultType, + Selector Sel, llvm::Value *Receiver, const CallArgList &CallArgs, + const ObjCInterfaceDecl *Class, const ObjCMethodDecl *Method) { return EmitMessageSend(CGF, Return, ResultType, Sel, Receiver, CGF.getContext().getObjCIdType(), false, CallArgs, Method, Class, ObjCTypes); } -CodeGen::RValue -CGObjCCommonMac::EmitMessageSend(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - llvm::Value *Arg0, - QualType Arg0Ty, - bool IsSuper, - const CallArgList &CallArgs, - const ObjCMethodDecl *Method, - const ObjCInterfaceDecl *ClassReceiver, - const ObjCCommonTypesHelper &ObjCTypes) { +CodeGen::RValue CGObjCCommonMac::EmitMessageSend( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, QualType ResultType, + Selector Sel, llvm::Value *Arg0, QualType Arg0Ty, bool IsSuper, + const CallArgList &CallArgs, const ObjCMethodDecl *Method, + const ObjCInterfaceDecl *ClassReceiver, + const ObjCCommonTypesHelper &ObjCTypes) { CodeGenTypes &Types = CGM.getTypes(); auto selTy = CGF.getContext().getObjCSelType(); llvm::Value *SelValue = llvm::UndefValue::get(Types.ConvertType(selTy)); @@ -2157,7 +2086,7 @@ CGObjCCommonMac::EmitMessageSend(CodeGen::CodeGenFunction &CGF, "Result type mismatch!"); bool ReceiverCanBeNull = - canMessageReceiverBeNull(CGF, Method, IsSuper, ClassReceiver, Arg0); + canMessageReceiverBeNull(CGF, Method, IsSuper, ClassReceiver, Arg0); bool RequiresNullCheck = false; bool RequiresSelValue = true; @@ -2170,22 +2099,23 @@ CGObjCCommonMac::EmitMessageSend(CodeGen::CodeGenFunction &CGF, // so just don't bother with setting the `_cmd` argument. RequiresSelValue = false; } else if (CGM.ReturnSlotInterferesWithArgs(MSI.CallInfo)) { - if (ReceiverCanBeNull) RequiresNullCheck = true; - Fn = (ObjCABI == 2) ? ObjCTypes.getSendStretFn2(IsSuper) - : ObjCTypes.getSendStretFn(IsSuper); + if (ReceiverCanBeNull) + RequiresNullCheck = true; + Fn = (ObjCABI == 2) ? ObjCTypes.getSendStretFn2(IsSuper) + : ObjCTypes.getSendStretFn(IsSuper); } else if (CGM.ReturnTypeUsesFPRet(ResultType)) { Fn = (ObjCABI == 2) ? 
ObjCTypes.getSendFpretFn2(IsSuper) - : ObjCTypes.getSendFpretFn(IsSuper); + : ObjCTypes.getSendFpretFn(IsSuper); } else if (CGM.ReturnTypeUsesFP2Ret(ResultType)) { Fn = (ObjCABI == 2) ? ObjCTypes.getSendFp2RetFn2(IsSuper) - : ObjCTypes.getSendFp2retFn(IsSuper); + : ObjCTypes.getSendFp2retFn(IsSuper); } else { // arm64 uses objc_msgSend for stret methods and yet null receiver check // must be made for it. if (ReceiverCanBeNull && CGM.ReturnTypeUsesSRet(MSI.CallInfo)) RequiresNullCheck = true; Fn = (ObjCABI == 2) ? ObjCTypes.getSendFn2(IsSuper) - : ObjCTypes.getSendFn(IsSuper); + : ObjCTypes.getSendFn(IsSuper); } // Cast function to proper signature @@ -2214,8 +2144,8 @@ CGObjCCommonMac::EmitMessageSend(CodeGen::CodeGenFunction &CGF, llvm::CallBase *CallSite; CGCallee Callee = CGCallee::forDirect(BitcastFn); - RValue rvalue = CGF.EmitCall(MSI.CallInfo, Callee, Return, ActualArgs, - &CallSite); + RValue rvalue = + CGF.EmitCall(MSI.CallInfo, Callee, Return, ActualArgs, &CallSite); // Mark the call as noreturn if the method is marked noreturn and the // receiver cannot be null. @@ -2240,13 +2170,19 @@ static Qualifiers::GC GetGCAttrTypeForType(ASTContext &Ctx, QualType FQT, if (auto ownership = FQT.getObjCLifetime()) { // Ownership does not apply recursively to C pointer types. - if (pointee) return Qualifiers::GCNone; + if (pointee) + return Qualifiers::GCNone; switch (ownership) { - case Qualifiers::OCL_Weak: return Qualifiers::Weak; - case Qualifiers::OCL_Strong: return Qualifiers::Strong; - case Qualifiers::OCL_ExplicitNone: return Qualifiers::GCNone; - case Qualifiers::OCL_Autoreleasing: llvm_unreachable("autoreleasing ivar?"); - case Qualifiers::OCL_None: llvm_unreachable("known nonzero"); + case Qualifiers::OCL_Weak: + return Qualifiers::Weak; + case Qualifiers::OCL_Strong: + return Qualifiers::Strong; + case Qualifiers::OCL_ExplicitNone: + return Qualifiers::GCNone; + case Qualifiers::OCL_Autoreleasing: + llvm_unreachable("autoreleasing ivar?"); + case Qualifiers::OCL_None: + llvm_unreachable("known nonzero"); } llvm_unreachable("bad objc ownership"); } @@ -2265,76 +2201,73 @@ static Qualifiers::GC GetGCAttrTypeForType(ASTContext &Ctx, QualType FQT, } namespace { - struct IvarInfo { - CharUnits Offset; - uint64_t SizeInWords; - IvarInfo(CharUnits offset, uint64_t sizeInWords) +struct IvarInfo { + CharUnits Offset; + uint64_t SizeInWords; + IvarInfo(CharUnits offset, uint64_t sizeInWords) : Offset(offset), SizeInWords(sizeInWords) {} - // Allow sorting based on byte pos. - bool operator<(const IvarInfo &other) const { - return Offset < other.Offset; - } - }; + // Allow sorting based on byte pos. + bool operator<(const IvarInfo &other) const { return Offset < other.Offset; } +}; - /// A helper class for building GC layout strings. - class IvarLayoutBuilder { - CodeGenModule &CGM; +/// A helper class for building GC layout strings. +class IvarLayoutBuilder { + CodeGenModule &CGM; - /// The start of the layout. Offsets will be relative to this value, - /// and entries less than this value will be silently discarded. - CharUnits InstanceBegin; + /// The start of the layout. Offsets will be relative to this value, + /// and entries less than this value will be silently discarded. + CharUnits InstanceBegin; - /// The end of the layout. Offsets will never exceed this value. - CharUnits InstanceEnd; + /// The end of the layout. Offsets will never exceed this value. + CharUnits InstanceEnd; - /// Whether we're generating the strong layout or the weak layout. 
- bool ForStrongLayout; + /// Whether we're generating the strong layout or the weak layout. + bool ForStrongLayout; - /// Whether the offsets in IvarsInfo might be out-of-order. - bool IsDisordered = false; + /// Whether the offsets in IvarsInfo might be out-of-order. + bool IsDisordered = false; - llvm::SmallVector IvarsInfo; + llvm::SmallVector IvarsInfo; - public: - IvarLayoutBuilder(CodeGenModule &CGM, CharUnits instanceBegin, - CharUnits instanceEnd, bool forStrongLayout) +public: + IvarLayoutBuilder(CodeGenModule &CGM, CharUnits instanceBegin, + CharUnits instanceEnd, bool forStrongLayout) : CGM(CGM), InstanceBegin(instanceBegin), InstanceEnd(instanceEnd), - ForStrongLayout(forStrongLayout) { - } + ForStrongLayout(forStrongLayout) {} - void visitRecord(const RecordType *RT, CharUnits offset); + void visitRecord(const RecordType *RT, CharUnits offset); - template - void visitAggregate(Iterator begin, Iterator end, - CharUnits aggrOffset, - const GetOffsetFn &getOffset); + template + void visitAggregate(Iterator begin, Iterator end, CharUnits aggrOffset, + const GetOffsetFn &getOffset); - void visitField(const FieldDecl *field, CharUnits offset); + void visitField(const FieldDecl *field, CharUnits offset); - /// Add the layout of a block implementation. - void visitBlock(const CGBlockInfo &blockInfo); + /// Add the layout of a block implementation. + void visitBlock(const CGBlockInfo &blockInfo); - /// Is there any information for an interesting bitmap? - bool hasBitmapData() const { return !IvarsInfo.empty(); } + /// Is there any information for an interesting bitmap? + bool hasBitmapData() const { return !IvarsInfo.empty(); } - llvm::Constant *buildBitmap(CGObjCCommonMac &CGObjC, - llvm::SmallVectorImpl &buffer); + llvm::Constant *buildBitmap(CGObjCCommonMac &CGObjC, + llvm::SmallVectorImpl &buffer); - static void dump(ArrayRef buffer) { - const unsigned char *s = buffer.data(); - for (unsigned i = 0, e = buffer.size(); i < e; i++) - if (!(s[i] & 0xf0)) - printf("0x0%x%s", s[i], s[i] != 0 ? ", " : ""); - else - printf("0x%x%s", s[i], s[i] != 0 ? ", " : ""); - printf("\n"); - } - }; + static void dump(ArrayRef buffer) { + const unsigned char *s = buffer.data(); + for (unsigned i = 0, e = buffer.size(); i < e; i++) + if (!(s[i] & 0xf0)) + printf("0x0%x%s", s[i], s[i] != 0 ? ", " : ""); + else + printf("0x%x%s", s[i], s[i] != 0 ? ", " : ""); + printf("\n"); + } +}; } // end anonymous namespace -llvm::Constant *CGObjCCommonMac::BuildGCBlockLayout(CodeGenModule &CGM, - const CGBlockInfo &blockInfo) { +llvm::Constant * +CGObjCCommonMac::BuildGCBlockLayout(CodeGenModule &CGM, + const CGBlockInfo &blockInfo) { llvm::Constant *nullPtr = llvm::Constant::getNullValue(CGM.Int8PtrTy); if (CGM.getLangOpts().getGC() == LangOptions::NonGC) @@ -2378,7 +2311,8 @@ void IvarLayoutBuilder::visitBlock(const CGBlockInfo &blockInfo) { const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable); // Ignore constant captures. - if (capture.isConstant()) continue; + if (capture.isConstant()) + continue; CharUnits fieldOffset = capture.getOffset(); @@ -2413,8 +2347,8 @@ void IvarLayoutBuilder::visitBlock(const CGBlockInfo &blockInfo) { /// getBlockCaptureLifetime - This routine returns life time of the captured /// block variable for the purpose of block layout meta-data generation. FQT is /// the type of the variable captured in the block. 
-Qualifiers::ObjCLifetime CGObjCCommonMac::getBlockCaptureLifetime(QualType FQT, - bool ByrefLayout) { +Qualifiers::ObjCLifetime +CGObjCCommonMac::getBlockCaptureLifetime(QualType FQT, bool ByrefLayout) { // If it has an ownership qualifier, we're done. if (auto lifetime = FQT.getObjCLifetime()) return lifetime; @@ -2436,26 +2370,25 @@ void CGObjCCommonMac::UpdateRunSkipBlockVars(bool IsByref, CharUnits FieldSize) { // __block variables are passed by their descriptor address. if (IsByref) - RunSkipBlockVars.push_back(RUN_SKIP(BLOCK_LAYOUT_BYREF, FieldOffset, - FieldSize)); + RunSkipBlockVars.push_back( + RUN_SKIP(BLOCK_LAYOUT_BYREF, FieldOffset, FieldSize)); else if (LifeTime == Qualifiers::OCL_Strong) - RunSkipBlockVars.push_back(RUN_SKIP(BLOCK_LAYOUT_STRONG, FieldOffset, - FieldSize)); + RunSkipBlockVars.push_back( + RUN_SKIP(BLOCK_LAYOUT_STRONG, FieldOffset, FieldSize)); else if (LifeTime == Qualifiers::OCL_Weak) - RunSkipBlockVars.push_back(RUN_SKIP(BLOCK_LAYOUT_WEAK, FieldOffset, - FieldSize)); + RunSkipBlockVars.push_back( + RUN_SKIP(BLOCK_LAYOUT_WEAK, FieldOffset, FieldSize)); else if (LifeTime == Qualifiers::OCL_ExplicitNone) - RunSkipBlockVars.push_back(RUN_SKIP(BLOCK_LAYOUT_UNRETAINED, FieldOffset, - FieldSize)); + RunSkipBlockVars.push_back( + RUN_SKIP(BLOCK_LAYOUT_UNRETAINED, FieldOffset, FieldSize)); else - RunSkipBlockVars.push_back(RUN_SKIP(BLOCK_LAYOUT_NON_OBJECT_BYTES, - FieldOffset, - FieldSize)); + RunSkipBlockVars.push_back( + RUN_SKIP(BLOCK_LAYOUT_NON_OBJECT_BYTES, FieldOffset, FieldSize)); } void CGObjCCommonMac::BuildRCRecordLayout(const llvm::StructLayout *RecLayout, const RecordDecl *RD, - ArrayRef RecFields, + ArrayRef RecFields, CharUnits BytePos, bool &HasUnion, bool ByrefLayout) { bool IsUnion = (RD && RD->isUnion()); @@ -2475,7 +2408,7 @@ void CGObjCCommonMac::BuildRCRecordLayout(const llvm::StructLayout *RecLayout, // although this dependency is hidden. const ASTRecordLayout &RL = CGM.getContext().getASTRecordLayout(RD); CharUnits FieldOffset = - CGM.getContext().toCharUnitsFromBits(RL.getFieldOffset(i)); + CGM.getContext().toCharUnitsFromBits(RL.getFieldOffset(i)); // Skip over unnamed or bitfields if (!Field->getIdentifier() || Field->isBitField()) { @@ -2513,13 +2446,14 @@ void CGObjCCommonMac::BuildRCRecordLayout(const llvm::StructLayout *RecLayout, // Replicate layout information for each array element. Note that // one element is already done. 
uint64_t ElIx = 1; - for (int FirstIndex = RunSkipBlockVars.size() - 1 ;ElIx < ElCount; ElIx++) { + for (int FirstIndex = RunSkipBlockVars.size() - 1; ElIx < ElCount; + ElIx++) { CharUnits Size = CGM.getContext().getTypeSizeInChars(RT); - for (int i = OldIndex+1; i <= FirstIndex; ++i) + for (int i = OldIndex + 1; i <= FirstIndex; ++i) RunSkipBlockVars.push_back( - RUN_SKIP(RunSkipBlockVars[i].opcode, - RunSkipBlockVars[i].block_var_bytepos + Size*ElIx, - RunSkipBlockVars[i].block_var_size)); + RUN_SKIP(RunSkipBlockVars[i].opcode, + RunSkipBlockVars[i].block_var_bytepos + Size * ElIx, + RunSkipBlockVars[i].block_var_size)); } continue; } @@ -2533,10 +2467,8 @@ void CGObjCCommonMac::BuildRCRecordLayout(const llvm::StructLayout *RecLayout, MaxFieldOffset = FieldOffset; } } else { - UpdateRunSkipBlockVars(false, - getBlockCaptureLifetime(FQT, ByrefLayout), - BytePos + FieldOffset, - FieldSize); + UpdateRunSkipBlockVars(false, getBlockCaptureLifetime(FQT, ByrefLayout), + BytePos + FieldOffset, FieldSize); } } @@ -2545,32 +2477,32 @@ void CGObjCCommonMac::BuildRCRecordLayout(const llvm::StructLayout *RecLayout, // Last field was a bitfield. Must update the info. uint64_t BitFieldSize = LastFieldBitfieldOrUnnamed->getBitWidthValue(); unsigned UnsSize = (BitFieldSize / ByteSizeInBits) + - ((BitFieldSize % ByteSizeInBits) != 0); + ((BitFieldSize % ByteSizeInBits) != 0); CharUnits Size = CharUnits::fromQuantity(UnsSize); Size += LastBitfieldOrUnnamedOffset; - UpdateRunSkipBlockVars(false, - getBlockCaptureLifetime(LastFieldBitfieldOrUnnamed->getType(), - ByrefLayout), - BytePos + LastBitfieldOrUnnamedOffset, - Size); + UpdateRunSkipBlockVars( + false, + getBlockCaptureLifetime(LastFieldBitfieldOrUnnamed->getType(), + ByrefLayout), + BytePos + LastBitfieldOrUnnamedOffset, Size); } else { - assert(!LastFieldBitfieldOrUnnamed->getIdentifier() &&"Expected unnamed"); + assert(!LastFieldBitfieldOrUnnamed->getIdentifier() && + "Expected unnamed"); // Last field was unnamed. Must update skip info. 
- CharUnits FieldSize - = CGM.getContext().getTypeSizeInChars(LastFieldBitfieldOrUnnamed->getType()); - UpdateRunSkipBlockVars(false, - getBlockCaptureLifetime(LastFieldBitfieldOrUnnamed->getType(), - ByrefLayout), - BytePos + LastBitfieldOrUnnamedOffset, - FieldSize); + CharUnits FieldSize = CGM.getContext().getTypeSizeInChars( + LastFieldBitfieldOrUnnamed->getType()); + UpdateRunSkipBlockVars( + false, + getBlockCaptureLifetime(LastFieldBitfieldOrUnnamed->getType(), + ByrefLayout), + BytePos + LastBitfieldOrUnnamedOffset, FieldSize); } } if (MaxField) - UpdateRunSkipBlockVars(false, - getBlockCaptureLifetime(MaxField->getType(), ByrefLayout), - BytePos + MaxFieldOffset, - MaxUnionSize); + UpdateRunSkipBlockVars( + false, getBlockCaptureLifetime(MaxField->getType(), ByrefLayout), + BytePos + MaxFieldOffset, MaxUnionSize); } void CGObjCCommonMac::BuildRCBlockVarRecordLayout(const RecordType *RT, @@ -2578,105 +2510,104 @@ void CGObjCCommonMac::BuildRCBlockVarRecordLayout(const RecordType *RT, bool &HasUnion, bool ByrefLayout) { const RecordDecl *RD = RT->getDecl(); - SmallVector Fields(RD->fields()); + SmallVector Fields(RD->fields()); llvm::Type *Ty = CGM.getTypes().ConvertType(QualType(RT, 0)); const llvm::StructLayout *RecLayout = - CGM.getDataLayout().getStructLayout(cast(Ty)); + CGM.getDataLayout().getStructLayout(cast(Ty)); BuildRCRecordLayout(RecLayout, RD, Fields, BytePos, HasUnion, ByrefLayout); } /// InlineLayoutInstruction - This routine produce an inline instruction for the /// block variable layout if it can. If not, it returns 0. Rules are as follow: -/// If ((uintptr_t) layout) < (1 << 12), the layout is inline. In the 64bit world, -/// an inline layout of value 0x0000000000000xyz is interpreted as follows: -/// x captured object pointers of BLOCK_LAYOUT_STRONG. Followed by -/// y captured object of BLOCK_LAYOUT_BYREF. Followed by -/// z captured object of BLOCK_LAYOUT_WEAK. If any of the above is missing, zero -/// replaces it. For example, 0x00000x00 means x BLOCK_LAYOUT_STRONG and no -/// BLOCK_LAYOUT_BYREF and no BLOCK_LAYOUT_WEAK objects are captured. +/// If ((uintptr_t) layout) < (1 << 12), the layout is inline. In the 64bit +/// world, an inline layout of value 0x0000000000000xyz is interpreted as +/// follows: x captured object pointers of BLOCK_LAYOUT_STRONG. Followed by y +/// captured object of BLOCK_LAYOUT_BYREF. Followed by z captured object of +/// BLOCK_LAYOUT_WEAK. If any of the above is missing, zero replaces it. For +/// example, 0x00000x00 means x BLOCK_LAYOUT_STRONG and no BLOCK_LAYOUT_BYREF +/// and no BLOCK_LAYOUT_WEAK objects are captured. 
uint64_t CGObjCCommonMac::InlineLayoutInstruction( - SmallVectorImpl &Layout) { + SmallVectorImpl &Layout) { uint64_t Result = 0; if (Layout.size() <= 3) { unsigned size = Layout.size(); - unsigned strong_word_count = 0, byref_word_count=0, weak_word_count=0; + unsigned strong_word_count = 0, byref_word_count = 0, weak_word_count = 0; unsigned char inst; - enum BLOCK_LAYOUT_OPCODE opcode ; + enum BLOCK_LAYOUT_OPCODE opcode; switch (size) { - case 3: - inst = Layout[0]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); - if (opcode == BLOCK_LAYOUT_STRONG) - strong_word_count = (inst & 0xF)+1; - else - return 0; + case 3: + inst = Layout[0]; + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); + if (opcode == BLOCK_LAYOUT_STRONG) + strong_word_count = (inst & 0xF) + 1; + else + return 0; + inst = Layout[1]; + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); + if (opcode == BLOCK_LAYOUT_BYREF) + byref_word_count = (inst & 0xF) + 1; + else + return 0; + inst = Layout[2]; + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); + if (opcode == BLOCK_LAYOUT_WEAK) + weak_word_count = (inst & 0xF) + 1; + else + return 0; + break; + + case 2: + inst = Layout[0]; + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); + if (opcode == BLOCK_LAYOUT_STRONG) { + strong_word_count = (inst & 0xF) + 1; inst = Layout[1]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); if (opcode == BLOCK_LAYOUT_BYREF) - byref_word_count = (inst & 0xF)+1; + byref_word_count = (inst & 0xF) + 1; + else if (opcode == BLOCK_LAYOUT_WEAK) + weak_word_count = (inst & 0xF) + 1; else return 0; - inst = Layout[2]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); + } else if (opcode == BLOCK_LAYOUT_BYREF) { + byref_word_count = (inst & 0xF) + 1; + inst = Layout[1]; + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); if (opcode == BLOCK_LAYOUT_WEAK) - weak_word_count = (inst & 0xF)+1; - else - return 0; - break; - - case 2: - inst = Layout[0]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); - if (opcode == BLOCK_LAYOUT_STRONG) { - strong_word_count = (inst & 0xF)+1; - inst = Layout[1]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); - if (opcode == BLOCK_LAYOUT_BYREF) - byref_word_count = (inst & 0xF)+1; - else if (opcode == BLOCK_LAYOUT_WEAK) - weak_word_count = (inst & 0xF)+1; - else - return 0; - } - else if (opcode == BLOCK_LAYOUT_BYREF) { - byref_word_count = (inst & 0xF)+1; - inst = Layout[1]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); - if (opcode == BLOCK_LAYOUT_WEAK) - weak_word_count = (inst & 0xF)+1; - else - return 0; - } - else - return 0; - break; - - case 1: - inst = Layout[0]; - opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); - if (opcode == BLOCK_LAYOUT_STRONG) - strong_word_count = (inst & 0xF)+1; - else if (opcode == BLOCK_LAYOUT_BYREF) - byref_word_count = (inst & 0xF)+1; - else if (opcode == BLOCK_LAYOUT_WEAK) - weak_word_count = (inst & 0xF)+1; + weak_word_count = (inst & 0xF) + 1; else return 0; - break; + } else + return 0; + break; - default: + case 1: + inst = Layout[0]; + opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); + if (opcode == BLOCK_LAYOUT_STRONG) + strong_word_count = (inst & 0xF) + 1; + else if (opcode == BLOCK_LAYOUT_BYREF) + byref_word_count = (inst & 0xF) + 1; + else if (opcode == BLOCK_LAYOUT_WEAK) + weak_word_count = (inst & 0xF) + 1; + else return 0; + break; + + default: + return 0; } // Cannot inline when any of the word counts is 15. 
Because this is one less // than the actual work count (so 15 means 16 actual word counts), // and we can only display 0 thru 15 word counts. - if (strong_word_count == 16 || byref_word_count == 16 || weak_word_count == 16) + if (strong_word_count == 16 || byref_word_count == 16 || + weak_word_count == 16) return 0; - unsigned count = - (strong_word_count != 0) + (byref_word_count != 0) + (weak_word_count != 0); + unsigned count = (strong_word_count != 0) + (byref_word_count != 0) + + (weak_word_count != 0); if (size == count) { if (strong_word_count) @@ -2698,7 +2629,7 @@ llvm::Constant *CGObjCCommonMac::getBitmapBlockLayout(bool ComputeByrefLayout) { return nullPtr; unsigned WordSizeInBits = CGM.getTarget().getPointerWidth(LangAS::Default); unsigned ByteSizeInBits = CGM.getTarget().getCharWidth(); - unsigned WordSizeInBytes = WordSizeInBits/ByteSizeInBits; + unsigned WordSizeInBytes = WordSizeInBits / ByteSizeInBits; // Sort on byte position; captures might not be allocated in order, // and unions can do funny things. @@ -2710,21 +2641,20 @@ llvm::Constant *CGObjCCommonMac::getBitmapBlockLayout(bool ComputeByrefLayout) { enum BLOCK_LAYOUT_OPCODE opcode = RunSkipBlockVars[i].opcode; CharUnits start_byte_pos = RunSkipBlockVars[i].block_var_bytepos; CharUnits end_byte_pos = start_byte_pos; - unsigned j = i+1; + unsigned j = i + 1; while (j < size) { if (opcode == RunSkipBlockVars[j].opcode) { end_byte_pos = RunSkipBlockVars[j++].block_var_bytepos; i++; - } - else + } else break; } CharUnits size_in_bytes = - end_byte_pos - start_byte_pos + RunSkipBlockVars[j-1].block_var_size; + end_byte_pos - start_byte_pos + RunSkipBlockVars[j - 1].block_var_size; if (j < size) { - CharUnits gap = - RunSkipBlockVars[j].block_var_bytepos - - RunSkipBlockVars[j-1].block_var_bytepos - RunSkipBlockVars[j-1].block_var_size; + CharUnits gap = RunSkipBlockVars[j].block_var_bytepos - + RunSkipBlockVars[j - 1].block_var_bytepos - + RunSkipBlockVars[j - 1].block_var_size; size_in_bytes += gap; } CharUnits residue_in_bytes = CharUnits::Zero(); @@ -2745,20 +2675,21 @@ llvm::Constant *CGObjCCommonMac::getBitmapBlockLayout(bool ComputeByrefLayout) { if (size_in_words > 0) { // Note that value in imm. is one less that the actual // value. So, we subtract 1 away! 
- unsigned char inst = (opcode << 4) | (size_in_words-1); + unsigned char inst = (opcode << 4) | (size_in_words - 1); Layout.push_back(inst); } if (residue_in_bytes > CharUnits::Zero()) { - unsigned char inst = - (BLOCK_LAYOUT_NON_OBJECT_BYTES << 4) | (residue_in_bytes.getQuantity()-1); + unsigned char inst = (BLOCK_LAYOUT_NON_OBJECT_BYTES << 4) | + (residue_in_bytes.getQuantity() - 1); Layout.push_back(inst); } } while (!Layout.empty()) { unsigned char inst = Layout.back(); - enum BLOCK_LAYOUT_OPCODE opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); - if (opcode == BLOCK_LAYOUT_NON_OBJECT_BYTES || opcode == BLOCK_LAYOUT_NON_OBJECT_WORDS) + enum BLOCK_LAYOUT_OPCODE opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); + if (opcode == BLOCK_LAYOUT_NON_OBJECT_BYTES || + opcode == BLOCK_LAYOUT_NON_OBJECT_WORDS) Layout.pop_back(); else break; @@ -2774,11 +2705,11 @@ llvm::Constant *CGObjCCommonMac::getBitmapBlockLayout(bool ComputeByrefLayout) { printf("\n Inline block variable layout: "); printf("0x0%" PRIx64 "", Result); if (auto numStrong = (Result & 0xF00) >> 8) - printf(", BL_STRONG:%d", (int) numStrong); + printf(", BL_STRONG:%d", (int)numStrong); if (auto numByref = (Result & 0x0F0) >> 4) - printf(", BL_BYREF:%d", (int) numByref); + printf(", BL_BYREF:%d", (int)numByref); if (auto numWeak = (Result & 0x00F) >> 0) - printf(", BL_WEAK:%d", (int) numWeak); + printf(", BL_WEAK:%d", (int)numWeak); printf(", BL_OPERATOR:0\n"); } return llvm::ConstantInt::get(CGM.IntPtrTy, Result); @@ -2797,36 +2728,36 @@ llvm::Constant *CGObjCCommonMac::getBitmapBlockLayout(bool ComputeByrefLayout) { printf("\n Block variable layout: "); for (unsigned i = 0, e = BitMap.size(); i != e; i++) { unsigned char inst = BitMap[i]; - enum BLOCK_LAYOUT_OPCODE opcode = (enum BLOCK_LAYOUT_OPCODE) (inst >> 4); + enum BLOCK_LAYOUT_OPCODE opcode = (enum BLOCK_LAYOUT_OPCODE)(inst >> 4); unsigned delta = 1; switch (opcode) { - case BLOCK_LAYOUT_OPERATOR: - printf("BL_OPERATOR:"); - delta = 0; - break; - case BLOCK_LAYOUT_NON_OBJECT_BYTES: - printf("BL_NON_OBJECT_BYTES:"); - break; - case BLOCK_LAYOUT_NON_OBJECT_WORDS: - printf("BL_NON_OBJECT_WORD:"); - break; - case BLOCK_LAYOUT_STRONG: - printf("BL_STRONG:"); - break; - case BLOCK_LAYOUT_BYREF: - printf("BL_BYREF:"); - break; - case BLOCK_LAYOUT_WEAK: - printf("BL_WEAK:"); - break; - case BLOCK_LAYOUT_UNRETAINED: - printf("BL_UNRETAINED:"); - break; + case BLOCK_LAYOUT_OPERATOR: + printf("BL_OPERATOR:"); + delta = 0; + break; + case BLOCK_LAYOUT_NON_OBJECT_BYTES: + printf("BL_NON_OBJECT_BYTES:"); + break; + case BLOCK_LAYOUT_NON_OBJECT_WORDS: + printf("BL_NON_OBJECT_WORD:"); + break; + case BLOCK_LAYOUT_STRONG: + printf("BL_STRONG:"); + break; + case BLOCK_LAYOUT_BYREF: + printf("BL_BYREF:"); + break; + case BLOCK_LAYOUT_WEAK: + printf("BL_WEAK:"); + break; + case BLOCK_LAYOUT_UNRETAINED: + printf("BL_UNRETAINED:"); + break; } // Actual value of word count is one more that what is in the imm. // field of the instruction printf("%d", (inst & 0xf) + delta); - if (i < e-1) + if (i < e - 1) printf(", "); else printf("\n"); @@ -2884,13 +2815,13 @@ void CGObjCCommonMac::fillRunSkipBlockVars(CodeGenModule &CGM, unsigned WordSizeInBits = CGM.getTarget().getPointerWidth(LangAS::Default); unsigned ByteSizeInBits = CGM.getTarget().getCharWidth(); - unsigned WordSizeInBytes = WordSizeInBits/ByteSizeInBits; + unsigned WordSizeInBytes = WordSizeInBits / ByteSizeInBits; const BlockDecl *blockDecl = blockInfo.getBlockDecl(); // Calculate the basic layout of the block structure. 
const llvm::StructLayout *layout = - CGM.getDataLayout().getStructLayout(blockInfo.StructureType); + CGM.getDataLayout().getStructLayout(blockInfo.StructureType); // Ignore the optional 'this' capture: C++ objects are not assumed // to be GC'ed. @@ -2906,10 +2837,11 @@ void CGObjCCommonMac::fillRunSkipBlockVars(CodeGenModule &CGM, const CGBlockInfo::Capture &capture = blockInfo.getCapture(variable); // Ignore constant captures. - if (capture.isConstant()) continue; + if (capture.isConstant()) + continue; CharUnits fieldOffset = - CharUnits::fromQuantity(layout->getElementOffset(capture.getIndex())); + CharUnits::fromQuantity(layout->getElementOffset(capture.getIndex())); assert(!type->isArrayType() && "array variable should not be caught"); if (!CI.isByRef()) @@ -2948,7 +2880,8 @@ llvm::Constant *CGObjCCommonMac::BuildByrefLayout(CodeGen::CodeGenModule &CGM, RunSkipBlockVars.clear(); bool hasUnion = false; if (const RecordType *record = T->getAs()) { - BuildRCBlockVarRecordLayout(record, fieldOffset, hasUnion, true /*ByrefLayout */); + BuildRCBlockVarRecordLayout(record, fieldOffset, hasUnion, + true /*ByrefLayout */); llvm::Constant *Result = getBitmapBlockLayout(true); if (isa(Result)) Result = llvm::ConstantExpr::getIntToPtr(Result, CGM.Int8PtrTy); @@ -2986,10 +2919,10 @@ llvm::Constant *CGObjCCommonMac::GetProtocolRef(const ObjCProtocolDecl *PD) { return GetOrEmitProtocolRef(PD); } -llvm::Value *CGObjCCommonMac::EmitClassRefViaRuntime( - CodeGenFunction &CGF, - const ObjCInterfaceDecl *ID, - ObjCCommonTypesHelper &ObjCTypes) { +llvm::Value * +CGObjCCommonMac::EmitClassRefViaRuntime(CodeGenFunction &CGF, + const ObjCInterfaceDecl *ID, + ObjCCommonTypesHelper &ObjCTypes) { llvm::FunctionCallee lookUpClassFn = ObjCTypes.getLookUpClassFn(); llvm::Value *className = CGF.CGM @@ -2997,10 +2930,8 @@ llvm::Value *CGObjCCommonMac::EmitClassRefViaRuntime( ID->getObjCRuntimeNameAsString())) .getPointer(); ASTContext &ctx = CGF.CGM.getContext(); - className = - CGF.Builder.CreateBitCast(className, - CGF.ConvertType( - ctx.getPointerType(ctx.CharTy.withConst()))); + className = CGF.Builder.CreateBitCast( + className, CGF.ConvertType(ctx.getPointerType(ctx.CharTy.withConst()))); llvm::CallInst *call = CGF.Builder.CreateCall(lookUpClassFn, className); call->setDoesNotThrow(); return call; @@ -3042,20 +2973,19 @@ llvm::Constant *CGObjCMac::GetOrEmitProtocol(const ObjCProtocolDecl *PD) { values.add(GetClassName(PD->getObjCRuntimeNameAsString())); values.add(EmitProtocolList("OBJC_PROTOCOL_REFS_" + PD->getName(), PD->protocol_begin(), PD->protocol_end())); - values.add(methodLists.emitMethodList(this, PD, - ProtocolMethodLists::RequiredInstanceMethods)); - values.add(methodLists.emitMethodList(this, PD, - ProtocolMethodLists::RequiredClassMethods)); + values.add(methodLists.emitMethodList( + this, PD, ProtocolMethodLists::RequiredInstanceMethods)); + values.add(methodLists.emitMethodList( + this, PD, ProtocolMethodLists::RequiredClassMethods)); if (Entry) { // Already created, update the initializer. 
assert(Entry->hasPrivateLinkage()); values.finishAndSetAsInitializer(Entry); } else { - Entry = values.finishAndCreateGlobal("OBJC_PROTOCOL_" + PD->getName(), - CGM.getPointerAlign(), - /*constant*/ false, - llvm::GlobalValue::PrivateLinkage); + Entry = values.finishAndCreateGlobal( + "OBJC_PROTOCOL_" + PD->getName(), CGM.getPointerAlign(), + /*constant*/ false, llvm::GlobalValue::PrivateLinkage); Entry->setSection("__OBJC,__protocol,regular,no_dead_strip"); Protocols[PD->getIdentifier()] = Entry; @@ -3096,36 +3026,30 @@ llvm::Constant *CGObjCMac::GetOrEmitProtocolRef(const ObjCProtocolDecl *PD) { llvm::Constant * CGObjCMac::EmitProtocolExtension(const ObjCProtocolDecl *PD, const ProtocolMethodLists &methodLists) { - auto optInstanceMethods = - methodLists.emitMethodList(this, PD, - ProtocolMethodLists::OptionalInstanceMethods); - auto optClassMethods = - methodLists.emitMethodList(this, PD, - ProtocolMethodLists::OptionalClassMethods); - - auto extendedMethodTypes = - EmitProtocolMethodTypes("OBJC_PROTOCOL_METHOD_TYPES_" + PD->getName(), - methodLists.emitExtendedTypesArray(this), - ObjCTypes); - - auto instanceProperties = - EmitPropertyList("OBJC_$_PROP_PROTO_LIST_" + PD->getName(), nullptr, PD, - ObjCTypes, false); + auto optInstanceMethods = methodLists.emitMethodList( + this, PD, ProtocolMethodLists::OptionalInstanceMethods); + auto optClassMethods = methodLists.emitMethodList( + this, PD, ProtocolMethodLists::OptionalClassMethods); + + auto extendedMethodTypes = EmitProtocolMethodTypes( + "OBJC_PROTOCOL_METHOD_TYPES_" + PD->getName(), + methodLists.emitExtendedTypesArray(this), ObjCTypes); + + auto instanceProperties = EmitPropertyList( + "OBJC_$_PROP_PROTO_LIST_" + PD->getName(), nullptr, PD, ObjCTypes, false); auto classProperties = - EmitPropertyList("OBJC_$_CLASS_PROP_PROTO_LIST_" + PD->getName(), nullptr, - PD, ObjCTypes, true); + EmitPropertyList("OBJC_$_CLASS_PROP_PROTO_LIST_" + PD->getName(), nullptr, + PD, ObjCTypes, true); // Return null if no extension bits are used. 
- if (optInstanceMethods->isNullValue() && - optClassMethods->isNullValue() && - extendedMethodTypes->isNullValue() && - instanceProperties->isNullValue() && + if (optInstanceMethods->isNullValue() && optClassMethods->isNullValue() && + extendedMethodTypes->isNullValue() && instanceProperties->isNullValue() && classProperties->isNullValue()) { return llvm::Constant::getNullValue(ObjCTypes.ProtocolExtensionPtrTy); } uint64_t size = - CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ProtocolExtensionTy); + CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ProtocolExtensionTy); ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(ObjCTypes.ProtocolExtensionTy); @@ -3187,11 +3111,10 @@ CGObjCMac::EmitProtocolList(Twine name, return GV; } -static void -PushProtocolProperties(llvm::SmallPtrSet &PropertySet, - SmallVectorImpl &Properties, - const ObjCProtocolDecl *Proto, - bool IsClassProperty) { +static void PushProtocolProperties( + llvm::SmallPtrSet &PropertySet, + SmallVectorImpl &Properties, + const ObjCProtocolDecl *Proto, bool IsClassProperty) { for (const auto *PD : Proto->properties()) { if (IsClassProperty != PD->isClassProperty()) continue; @@ -3216,11 +3139,9 @@ PushProtocolProperties(llvm::SmallPtrSet &PropertySet, struct _objc_property[prop_count]; }; */ -llvm::Constant *CGObjCCommonMac::EmitPropertyList(Twine Name, - const Decl *Container, - const ObjCContainerDecl *OCD, - const ObjCCommonTypesHelper &ObjCTypes, - bool IsClassProperty) { +llvm::Constant *CGObjCCommonMac::EmitPropertyList( + Twine Name, const Decl *Container, const ObjCContainerDecl *OCD, + const ObjCCommonTypesHelper &ObjCTypes, bool IsClassProperty) { if (IsClassProperty) { // Make this entry NULL for OS X with deployment target < 10.11, for iOS // with deployment target < 9.0. @@ -3231,7 +3152,7 @@ llvm::Constant *CGObjCCommonMac::EmitPropertyList(Twine Name, } SmallVector Properties; - llvm::SmallPtrSet PropertySet; + llvm::SmallPtrSet PropertySet; if (const ObjCInterfaceDecl *OID = dyn_cast(OCD)) for (const ObjCCategoryDecl *ClassExt : OID->known_extensions()) @@ -3259,8 +3180,7 @@ llvm::Constant *CGObjCCommonMac::EmitPropertyList(Twine Name, if (const ObjCInterfaceDecl *OID = dyn_cast(OCD)) { for (const auto *P : OID->all_referenced_protocols()) PushProtocolProperties(PropertySet, Properties, P, IsClassProperty); - } - else if (const ObjCCategoryDecl *CD = dyn_cast(OCD)) { + } else if (const ObjCCategoryDecl *CD = dyn_cast(OCD)) { for (const auto *P : CD->protocols()) PushProtocolProperties(PropertySet, Properties, P, IsClassProperty); } @@ -3270,7 +3190,7 @@ llvm::Constant *CGObjCCommonMac::EmitPropertyList(Twine Name, return llvm::Constant::getNullValue(ObjCTypes.PropertyListPtrTy); unsigned propertySize = - CGM.getDataLayout().getTypeAllocSize(ObjCTypes.PropertyTy); + CGM.getDataLayout().getTypeAllocSize(ObjCTypes.PropertyTy); ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(); @@ -3295,16 +3215,15 @@ llvm::Constant *CGObjCCommonMac::EmitPropertyList(Twine Name, return GV; } -llvm::Constant * -CGObjCCommonMac::EmitProtocolMethodTypes(Twine Name, - ArrayRef MethodTypes, - const ObjCCommonTypesHelper &ObjCTypes) { +llvm::Constant *CGObjCCommonMac::EmitProtocolMethodTypes( + Twine Name, ArrayRef MethodTypes, + const ObjCCommonTypesHelper &ObjCTypes) { // Return null for empty list. 
if (MethodTypes.empty()) return llvm::Constant::getNullValue(ObjCTypes.Int8PtrPtrTy); - llvm::ArrayType *AT = llvm::ArrayType::get(ObjCTypes.Int8PtrTy, - MethodTypes.size()); + llvm::ArrayType *AT = + llvm::ArrayType::get(ObjCTypes.Int8PtrTy, MethodTypes.size()); llvm::Constant *Init = llvm::ConstantArray::get(AT, MethodTypes); StringRef Section; @@ -3337,20 +3256,16 @@ void CGObjCMac::GenerateCategory(const ObjCCategoryImplDecl *OCD) { // @implementation so everyone else can live life under a clear blue sky. const ObjCInterfaceDecl *Interface = OCD->getClassInterface(); const ObjCCategoryDecl *Category = - Interface->FindCategoryDeclaration(OCD->getIdentifier()); + Interface->FindCategoryDeclaration(OCD->getIdentifier()); SmallString<256> ExtName; - llvm::raw_svector_ostream(ExtName) << Interface->getName() << '_' - << OCD->getName(); + llvm::raw_svector_ostream(ExtName) + << Interface->getName() << '_' << OCD->getName(); ConstantInitBuilder Builder(CGM); auto Values = Builder.beginStruct(ObjCTypes.CategoryTy); - enum { - InstanceMethods, - ClassMethods, - NumMethodLists - }; + enum { InstanceMethods, ClassMethods, NumMethodLists }; SmallVector Methods[NumMethodLists]; for (const auto *MD : OCD->methods()) { if (!MD->isDirectMethod()) @@ -3366,9 +3281,9 @@ void CGObjCMac::GenerateCategory(const ObjCCategoryImplDecl *OCD) { Values.add(emitMethodList(ExtName, MethodListType::CategoryClassMethods, Methods[ClassMethods])); if (Category) { - Values.add( - EmitProtocolList("OBJC_CATEGORY_PROTOCOLS_" + ExtName.str(), - Category->protocol_begin(), Category->protocol_end())); + Values.add(EmitProtocolList("OBJC_CATEGORY_PROTOCOLS_" + ExtName.str(), + Category->protocol_begin(), + Category->protocol_end())); } else { Values.addNullPointer(ObjCTypes.ProtocolListPtrTy); } @@ -3376,25 +3291,25 @@ void CGObjCMac::GenerateCategory(const ObjCCategoryImplDecl *OCD) { // If there is no category @interface then there can be no properties. if (Category) { - Values.add(EmitPropertyList("_OBJC_$_PROP_LIST_" + ExtName.str(), - OCD, Category, ObjCTypes, false)); - Values.add(EmitPropertyList("_OBJC_$_CLASS_PROP_LIST_" + ExtName.str(), - OCD, Category, ObjCTypes, true)); + Values.add(EmitPropertyList("_OBJC_$_PROP_LIST_" + ExtName.str(), OCD, + Category, ObjCTypes, false)); + Values.add(EmitPropertyList("_OBJC_$_CLASS_PROP_LIST_" + ExtName.str(), OCD, + Category, ObjCTypes, true)); } else { Values.addNullPointer(ObjCTypes.PropertyListPtrTy); Values.addNullPointer(ObjCTypes.PropertyListPtrTy); } - llvm::GlobalVariable *GV = - CreateMetadataVar("OBJC_CATEGORY_" + ExtName.str(), Values, - "__OBJC,__category,regular,no_dead_strip", - CGM.getPointerAlign(), true); + llvm::GlobalVariable *GV = CreateMetadataVar( + "OBJC_CATEGORY_" + ExtName.str(), Values, + "__OBJC,__category,regular,no_dead_strip", CGM.getPointerAlign(), true); DefinedCategories.push_back(GV); DefinedCategoryNames.insert(llvm::CachedHashString(ExtName)); // method definition entries must be clear for next implementation. MethodDefinitions.clear(); } +// clang-format off enum FragileClassFlags { /// Apparently: is not a meta-class. FragileABI_Class_Factory = 0x00001, @@ -3445,6 +3360,7 @@ enum NonFragileClassFlags { /// Exclusive with CompiledByARC. NonFragileABI_Class_HasMRCWeakIvars = 0x00200, }; +// clang-format on static bool hasWeakMember(QualType type) { if (type.getObjCLifetime() == Qualifiers::OCL_Weak) { @@ -3466,11 +3382,12 @@ static bool hasWeakMember(QualType type) { /// __weak ivars. 
static bool hasMRCWeakIvars(CodeGenModule &CGM, const ObjCImplementationDecl *ID) { - if (!CGM.getLangOpts().ObjCWeak) return false; + if (!CGM.getLangOpts().ObjCWeak) + return false; assert(CGM.getLangOpts().getGC() == LangOptions::NonGC); for (const ObjCIvarDecl *ivar = - ID->getClassInterface()->all_declared_ivar_begin(); + ID->getClassInterface()->all_declared_ivar_begin(); ivar; ivar = ivar->getNextIvar()) { if (hasWeakMember(ivar->getType())) return true; @@ -3506,7 +3423,7 @@ void CGObjCMac::GenerateClass(const ObjCImplementationDecl *ID) { std::string ClassName = ID->getNameAsString(); // FIXME: Gross ObjCInterfaceDecl *Interface = - const_cast(ID->getClassInterface()); + const_cast(ID->getClassInterface()); llvm::Constant *Protocols = EmitProtocolList("OBJC_CLASS_PROTOCOLS_" + ID->getName(), Interface->all_referenced_protocol_begin(), @@ -3523,17 +3440,13 @@ void CGObjCMac::GenerateClass(const ObjCImplementationDecl *ID) { Flags |= FragileABI_Class_HasMRCWeakIvars; CharUnits Size = - CGM.getContext().getASTObjCImplementationLayout(ID).getSize(); + CGM.getContext().getASTObjCImplementationLayout(ID).getSize(); // FIXME: Set CXX-structors flag. if (ID->getClassInterface()->getVisibility() == HiddenVisibility) Flags |= FragileABI_Class_Hidden; - enum { - InstanceMethods, - ClassMethods, - NumMethodLists - }; + enum { InstanceMethods, ClassMethods, NumMethodLists }; SmallVector Methods[NumMethodLists]; for (const auto *MD : ID->methods()) { if (!MD->isDirectMethod()) @@ -3599,9 +3512,10 @@ void CGObjCMac::GenerateClass(const ObjCImplementationDecl *ID) { MethodDefinitions.clear(); } -llvm::Constant *CGObjCMac::EmitMetaClass(const ObjCImplementationDecl *ID, - llvm::Constant *Protocols, - ArrayRef Methods) { +llvm::Constant * +CGObjCMac::EmitMetaClass(const ObjCImplementationDecl *ID, + llvm::Constant *Protocols, + ArrayRef Methods) { unsigned Flags = FragileABI_Class_Meta; unsigned Size = CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ClassTy); @@ -3629,16 +3543,16 @@ llvm::Constant *CGObjCMac::EmitMetaClass(const ObjCImplementationDecl *ID, values.addInt(ObjCTypes.LongTy, Flags); values.addInt(ObjCTypes.LongTy, Size); values.add(EmitIvarList(ID, true)); - values.add(emitMethodList(ID->getName(), MethodListType::ClassMethods, - Methods)); + values.add( + emitMethodList(ID->getName(), MethodListType::ClassMethods, Methods)); // cache is always NULL. values.addNullPointer(ObjCTypes.CachePtrTy); values.add(Protocols); // ivar_layout for metaclass is always NULL. values.addNullPointer(ObjCTypes.Int8PtrTy); // The class extension is used to store class properties for metaclasses. - values.add(EmitClassExtension(ID, CharUnits::Zero(), false/*hasMRCWeak*/, - /*isMetaclass*/true)); + values.add(EmitClassExtension(ID, CharUnits::Zero(), false /*hasMRCWeak*/, + /*isMetaclass*/ true)); std::string Name("OBJC_METACLASS_"); Name += ID->getName(); @@ -3707,10 +3621,10 @@ llvm::Value *CGObjCMac::EmitSuperClassRef(const ObjCInterfaceDecl *ID) { struct _objc_property_list *properties; }; */ -llvm::Constant * -CGObjCMac::EmitClassExtension(const ObjCImplementationDecl *ID, - CharUnits InstanceSize, bool hasMRCWeakIvars, - bool isMetaclass) { +llvm::Constant *CGObjCMac::EmitClassExtension(const ObjCImplementationDecl *ID, + CharUnits InstanceSize, + bool hasMRCWeakIvars, + bool isMetaclass) { // Weak ivar layout. llvm::Constant *layout; if (isMetaclass) { @@ -3722,10 +3636,10 @@ CGObjCMac::EmitClassExtension(const ObjCImplementationDecl *ID, // Properties. 
llvm::Constant *propertyList = - EmitPropertyList((isMetaclass ? Twine("_OBJC_$_CLASS_PROP_LIST_") - : Twine("_OBJC_$_PROP_LIST_")) - + ID->getName(), - ID, ID->getClassInterface(), ObjCTypes, isMetaclass); + EmitPropertyList((isMetaclass ? Twine("_OBJC_$_CLASS_PROP_LIST_") + : Twine("_OBJC_$_PROP_LIST_")) + + ID->getName(), + ID, ID->getClassInterface(), ObjCTypes, isMetaclass); // Return null if no extension bits are used. if (layout->isNullValue() && propertyList->isNullValue()) { @@ -3733,7 +3647,7 @@ CGObjCMac::EmitClassExtension(const ObjCImplementationDecl *ID, } uint64_t size = - CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ClassExtensionTy); + CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ClassExtensionTy); ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(ObjCTypes.ClassExtensionTy); @@ -3775,8 +3689,8 @@ llvm::Constant *CGObjCMac::EmitIvarList(const ObjCImplementationDecl *ID, auto countSlot = ivarList.addPlaceholder(); auto ivars = ivarList.beginArray(ObjCTypes.IvarTy); - for (const ObjCIvarDecl *IVD = OID->all_declared_ivar_begin(); - IVD; IVD = IVD->getNextIvar()) { + for (const ObjCIvarDecl *IVD = OID->all_declared_ivar_begin(); IVD; + IVD = IVD->getNextIvar()) { // Ignore unnamed bit-fields. if (!IVD->getDeclName()) continue; @@ -3852,8 +3766,9 @@ void CGObjCMac::emitMethodConstant(ConstantArrayBuilder &builder, /// int count; /// struct objc_method_description list[count]; /// }; -llvm::Constant *CGObjCMac::emitMethodList(Twine name, MethodListType MLT, - ArrayRef methods) { +llvm::Constant * +CGObjCMac::emitMethodList(Twine name, MethodListType MLT, + ArrayRef methods) { StringRef prefix; StringRef section; bool forProtocol = false; @@ -3902,9 +3817,9 @@ llvm::Constant *CGObjCMac::emitMethodList(Twine name, MethodListType MLT, // Return null for empty list. if (methods.empty()) - return llvm::Constant::getNullValue(forProtocol - ? ObjCTypes.MethodDescriptionListPtrTy - : ObjCTypes.MethodListPtrTy); + return llvm::Constant::getNullValue( + forProtocol ? ObjCTypes.MethodDescriptionListPtrTy + : ObjCTypes.MethodListPtrTy); // For protocols, this is an objc_method_description_list, which has // a slightly different structure. 
@@ -3952,9 +3867,8 @@ llvm::Function *CGObjCCommonMac::GenerateMethod(const ObjCMethodDecl *OMD, CodeGenTypes &Types = CGM.getTypes(); llvm::FunctionType *MethodTy = Types.GetFunctionType(Types.arrangeObjCMethodDeclaration(OMD)); - Method = - llvm::Function::Create(MethodTy, llvm::GlobalValue::InternalLinkage, - Name, &CGM.getModule()); + Method = llvm::Function::Create( + MethodTy, llvm::GlobalValue::InternalLinkage, Name, &CGM.getModule()); } MethodDefinitions.insert(std::make_pair(OMD, Method)); @@ -3986,7 +3900,7 @@ CGObjCCommonMac::GenerateDirectMethod(const ObjCMethodDecl *OMD, CodeGenTypes &Types = CGM.getTypes(); llvm::FunctionType *MethodTy = - Types.GetFunctionType(Types.arrangeObjCMethodDeclaration(OMD)); + Types.GetFunctionType(Types.arrangeObjCMethodDeclaration(OMD)); if (OldFn) { Fn = llvm::Function::Create(MethodTy, llvm::GlobalValue::ExternalLinkage, @@ -4096,11 +4010,10 @@ void CGObjCCommonMac::GenerateDirectMethodPrologue( } } -llvm::GlobalVariable *CGObjCCommonMac::CreateMetadataVar(Twine Name, - ConstantStructBuilder &Init, - StringRef Section, - CharUnits Align, - bool AddToUsed) { +llvm::GlobalVariable * +CGObjCCommonMac::CreateMetadataVar(Twine Name, ConstantStructBuilder &Init, + StringRef Section, CharUnits Align, + bool AddToUsed) { llvm::GlobalValue::LinkageTypes LT = getLinkageTypeForObjCMetadata(CGM, Section); llvm::GlobalVariable *GV = @@ -4136,10 +4049,18 @@ CGObjCCommonMac::CreateCStringLiteral(StringRef Name, ObjCLabelType Type, bool NullTerminate) { StringRef Label; switch (Type) { - case ObjCLabelType::ClassName: Label = "OBJC_CLASS_NAME_"; break; - case ObjCLabelType::MethodVarName: Label = "OBJC_METH_VAR_NAME_"; break; - case ObjCLabelType::MethodVarType: Label = "OBJC_METH_VAR_TYPE_"; break; - case ObjCLabelType::PropertyName: Label = "OBJC_PROP_NAME_ATTR_"; break; + case ObjCLabelType::ClassName: + Label = "OBJC_CLASS_NAME_"; + break; + case ObjCLabelType::MethodVarName: + Label = "OBJC_METH_VAR_NAME_"; + break; + case ObjCLabelType::MethodVarType: + Label = "OBJC_METH_VAR_TYPE_"; + break; + case ObjCLabelType::PropertyName: + Label = "OBJC_PROP_NAME_ATTR_"; + break; } bool NonFragile = ForceNonFragileABI || isNonFragileABI(); @@ -4166,10 +4087,9 @@ CGObjCCommonMac::CreateCStringLiteral(StringRef Name, ObjCLabelType Type, llvm::Constant *Value = llvm::ConstantDataArray::getString(VMContext, Name, NullTerminate); - llvm::GlobalVariable *GV = - new llvm::GlobalVariable(CGM.getModule(), Value->getType(), - /*isConstant=*/true, - llvm::GlobalValue::PrivateLinkage, Value, Label); + llvm::GlobalVariable *GV = new llvm::GlobalVariable( + CGM.getModule(), Value->getType(), + /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, Value, Label); if (CGM.getTriple().isOSBinFormatMachO()) GV->setSection(Section); GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); @@ -4228,85 +4148,84 @@ void CGObjCMac::EmitSynchronizedStmt(CodeGenFunction &CGF, } namespace { - struct PerformFragileFinally final : EHScopeStack::Cleanup { - const Stmt &S; - Address SyncArgSlot; - Address CallTryExitVar; - Address ExceptionData; - ObjCTypesHelper &ObjCTypes; - PerformFragileFinally(const Stmt *S, - Address SyncArgSlot, - Address CallTryExitVar, - Address ExceptionData, - ObjCTypesHelper *ObjCTypes) +struct PerformFragileFinally final : EHScopeStack::Cleanup { + const Stmt &S; + Address SyncArgSlot; + Address CallTryExitVar; + Address ExceptionData; + ObjCTypesHelper &ObjCTypes; + PerformFragileFinally(const Stmt *S, Address SyncArgSlot, + Address CallTryExitVar, Address 
ExceptionData, + ObjCTypesHelper *ObjCTypes) : S(*S), SyncArgSlot(SyncArgSlot), CallTryExitVar(CallTryExitVar), ExceptionData(ExceptionData), ObjCTypes(*ObjCTypes) {} - void Emit(CodeGenFunction &CGF, Flags flags) override { - // Check whether we need to call objc_exception_try_exit. - // In optimized code, this branch will always be folded. - llvm::BasicBlock *FinallyCallExit = + void Emit(CodeGenFunction &CGF, Flags flags) override { + // Check whether we need to call objc_exception_try_exit. + // In optimized code, this branch will always be folded. + llvm::BasicBlock *FinallyCallExit = CGF.createBasicBlock("finally.call_exit"); - llvm::BasicBlock *FinallyNoCallExit = + llvm::BasicBlock *FinallyNoCallExit = CGF.createBasicBlock("finally.no_call_exit"); - CGF.Builder.CreateCondBr(CGF.Builder.CreateLoad(CallTryExitVar), - FinallyCallExit, FinallyNoCallExit); + CGF.Builder.CreateCondBr(CGF.Builder.CreateLoad(CallTryExitVar), + FinallyCallExit, FinallyNoCallExit); - CGF.EmitBlock(FinallyCallExit); - CGF.EmitNounwindRuntimeCall(ObjCTypes.getExceptionTryExitFn(), - ExceptionData.emitRawPointer(CGF)); + CGF.EmitBlock(FinallyCallExit); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getExceptionTryExitFn(), + ExceptionData.emitRawPointer(CGF)); - CGF.EmitBlock(FinallyNoCallExit); + CGF.EmitBlock(FinallyNoCallExit); - if (isa(S)) { - if (const ObjCAtFinallyStmt* FinallyStmt = + if (isa(S)) { + if (const ObjCAtFinallyStmt *FinallyStmt = cast(S).getFinallyStmt()) { - // Don't try to do the @finally if this is an EH cleanup. - if (flags.isForEHCleanup()) return; + // Don't try to do the @finally if this is an EH cleanup. + if (flags.isForEHCleanup()) + return; - // Save the current cleanup destination in case there's - // control flow inside the finally statement. - llvm::Value *CurCleanupDest = + // Save the current cleanup destination in case there's + // control flow inside the finally statement. + llvm::Value *CurCleanupDest = CGF.Builder.CreateLoad(CGF.getNormalCleanupDestSlot()); - CGF.EmitStmt(FinallyStmt->getFinallyBody()); + CGF.EmitStmt(FinallyStmt->getFinallyBody()); - if (CGF.HaveInsertPoint()) { - CGF.Builder.CreateStore(CurCleanupDest, - CGF.getNormalCleanupDestSlot()); - } else { - // Currently, the end of the cleanup must always exist. - CGF.EnsureInsertPoint(); - } + if (CGF.HaveInsertPoint()) { + CGF.Builder.CreateStore(CurCleanupDest, + CGF.getNormalCleanupDestSlot()); + } else { + // Currently, the end of the cleanup must always exist. + CGF.EnsureInsertPoint(); } - } else { - // Emit objc_sync_exit(expr); as finally's sole statement for - // @synchronized. - llvm::Value *SyncArg = CGF.Builder.CreateLoad(SyncArgSlot); - CGF.EmitNounwindRuntimeCall(ObjCTypes.getSyncExitFn(), SyncArg); } + } else { + // Emit objc_sync_exit(expr); as finally's sole statement for + // @synchronized. 
+ llvm::Value *SyncArg = CGF.Builder.CreateLoad(SyncArgSlot); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getSyncExitFn(), SyncArg); } - }; + } +}; - class FragileHazards { - CodeGenFunction &CGF; - SmallVector Locals; - llvm::DenseSet BlocksBeforeTry; +class FragileHazards { + CodeGenFunction &CGF; + SmallVector Locals; + llvm::DenseSet BlocksBeforeTry; - llvm::InlineAsm *ReadHazard; - llvm::InlineAsm *WriteHazard; + llvm::InlineAsm *ReadHazard; + llvm::InlineAsm *WriteHazard; - llvm::FunctionType *GetAsmFnType(); + llvm::FunctionType *GetAsmFnType(); - void collectLocals(); - void emitReadHazard(CGBuilderTy &Builder); + void collectLocals(); + void emitReadHazard(CGBuilderTy &Builder); - public: - FragileHazards(CodeGenFunction &CGF); +public: + FragileHazards(CodeGenFunction &CGF); - void emitWriteHazard(); - void emitHazardsInNewBlocks(); - }; + void emitWriteHazard(); + void emitHazardsInNewBlocks(); +}; } // end anonymous namespace /// Create the fragile-ABI read and write hazards based on the current @@ -4317,11 +4236,12 @@ namespace { FragileHazards::FragileHazards(CodeGenFunction &CGF) : CGF(CGF) { collectLocals(); - if (Locals.empty()) return; + if (Locals.empty()) + return; // Collect all the blocks in the function. - for (llvm::Function::iterator - I = CGF.CurFn->begin(), E = CGF.CurFn->end(); I != E; ++I) + for (llvm::Function::iterator I = CGF.CurFn->begin(), E = CGF.CurFn->end(); + I != E; ++I) BlocksBeforeTry.insert(&*I); llvm::FunctionType *AsmFnTy = GetAsmFnType(); @@ -4334,7 +4254,8 @@ FragileHazards::FragileHazards(CodeGenFunction &CGF) : CGF(CGF) { { std::string Constraint; for (unsigned I = 0, E = Locals.size(); I != E; ++I) { - if (I) Constraint += ','; + if (I) + Constraint += ','; Constraint += "*m"; } @@ -4348,7 +4269,8 @@ FragileHazards::FragileHazards(CodeGenFunction &CGF) : CGF(CGF) { { std::string Constraint; for (unsigned I = 0, E = Locals.size(); I != E; ++I) { - if (I) Constraint += ','; + if (I) + Constraint += ','; Constraint += "=*m"; } @@ -4358,13 +4280,16 @@ FragileHazards::FragileHazards(CodeGenFunction &CGF) : CGF(CGF) { /// Emit a write hazard at the current location. void FragileHazards::emitWriteHazard() { - if (Locals.empty()) return; + if (Locals.empty()) + return; llvm::CallInst *Call = CGF.EmitNounwindRuntimeCall(WriteHazard, Locals); for (auto Pair : llvm::enumerate(Locals)) - Call->addParamAttr(Pair.index(), llvm::Attribute::get( - CGF.getLLVMContext(), llvm::Attribute::ElementType, - cast(Pair.value())->getAllocatedType())); + Call->addParamAttr( + Pair.index(), + llvm::Attribute::get( + CGF.getLLVMContext(), llvm::Attribute::ElementType, + cast(Pair.value())->getAllocatedType())); } void FragileHazards::emitReadHazard(CGBuilderTy &Builder) { @@ -4373,27 +4298,31 @@ void FragileHazards::emitReadHazard(CGBuilderTy &Builder) { call->setDoesNotThrow(); call->setCallingConv(CGF.getRuntimeCC()); for (auto Pair : llvm::enumerate(Locals)) - call->addParamAttr(Pair.index(), llvm::Attribute::get( - Builder.getContext(), llvm::Attribute::ElementType, - cast(Pair.value())->getAllocatedType())); + call->addParamAttr( + Pair.index(), + llvm::Attribute::get( + Builder.getContext(), llvm::Attribute::ElementType, + cast(Pair.value())->getAllocatedType())); } /// Emit read hazards in all the protected blocks, i.e. all the blocks /// which have been inserted since the beginning of the try. 
void FragileHazards::emitHazardsInNewBlocks() { - if (Locals.empty()) return; + if (Locals.empty()) + return; CGBuilderTy Builder(CGF, CGF.getLLVMContext()); // Iterate through all blocks, skipping those prior to the try. - for (llvm::Function::iterator - FI = CGF.CurFn->begin(), FE = CGF.CurFn->end(); FI != FE; ++FI) { + for (llvm::Function::iterator FI = CGF.CurFn->begin(), FE = CGF.CurFn->end(); + FI != FE; ++FI) { llvm::BasicBlock &BB = *FI; - if (BlocksBeforeTry.count(&BB)) continue; + if (BlocksBeforeTry.count(&BB)) + continue; // Walk through all the calls in the block. - for (llvm::BasicBlock::iterator - BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) { + for (llvm::BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE; + ++BI) { llvm::Instruction &I = *BI; // Ignore instructions that aren't non-intrinsic calls. @@ -4419,7 +4348,7 @@ void FragileHazards::emitHazardsInNewBlocks() { } } -static void addIfPresent(llvm::DenseSet &S, Address V) { +static void addIfPresent(llvm::DenseSet &S, Address V) { if (V.isValid()) if (llvm::Value *Ptr = V.getBasePointer()) S.insert(Ptr); @@ -4427,15 +4356,15 @@ static void addIfPresent(llvm::DenseSet &S, Address V) { void FragileHazards::collectLocals() { // Compute a set of allocas to ignore. - llvm::DenseSet AllocasToIgnore; + llvm::DenseSet AllocasToIgnore; addIfPresent(AllocasToIgnore, CGF.ReturnValue); addIfPresent(AllocasToIgnore, CGF.NormalCleanupDest); // Collect all the allocas currently in the function. This is // probably way too aggressive. llvm::BasicBlock &Entry = CGF.CurFn->getEntryBlock(); - for (llvm::BasicBlock::iterator - I = Entry.begin(), E = Entry.end(); I != E; ++I) + for (llvm::BasicBlock::iterator I = Entry.begin(), E = Entry.end(); I != E; + ++I) if (isa(*I) && !AllocasToIgnore.count(&*I)) Locals.push_back(&*I); } @@ -4562,12 +4491,12 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // A destination for the fall-through edges of the catch handlers to // jump to. CodeGenFunction::JumpDest FinallyEnd = - CGF.getJumpDestInCurrentScope("finally.end"); + CGF.getJumpDestInCurrentScope("finally.end"); // A destination for the rethrow edge of the catch handlers to jump // to. CodeGenFunction::JumpDest FinallyRethrow = - CGF.getJumpDestInCurrentScope("finally.rethrow"); + CGF.getJumpDestInCurrentScope("finally.rethrow"); // For @synchronized, call objc_sync_enter(sync.expr). The // evaluation of the expression must occur before we enter the @@ -4577,7 +4506,7 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, Address SyncArgSlot = Address::invalid(); if (!isTry) { llvm::Value *SyncArg = - CGF.EmitScalarExpr(cast(S).getSynchExpr()); + CGF.EmitScalarExpr(cast(S).getSynchExpr()); SyncArg = CGF.Builder.CreateBitCast(SyncArg, ObjCTypes.ObjectPtrTy); CGF.EmitNounwindRuntimeCall(ObjCTypes.getSyncEnterFn(), SyncArg); @@ -4588,9 +4517,8 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // Allocate memory for the setjmp buffer. This needs to be kept // live throughout the try and catch blocks. - Address ExceptionData = CGF.CreateTempAlloca(ObjCTypes.ExceptionDataTy, - CGF.getPointerAlign(), - "exceptiondata.ptr"); + Address ExceptionData = CGF.CreateTempAlloca( + ObjCTypes.ExceptionDataTy, CGF.getPointerAlign(), "exceptiondata.ptr"); // Create the fragile hazards. 
Note that this will not capture any // of the allocas required for exception processing, but will @@ -4606,9 +4534,8 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // The setjmp-safety rule here is that we should always store to this // variable in a place that dominates the branch through the cleanup // without passing through any setjmps. - Address CallTryExitVar = CGF.CreateTempAlloca(CGF.Builder.getInt1Ty(), - CharUnits::One(), - "_call_try_exit"); + Address CallTryExitVar = CGF.CreateTempAlloca( + CGF.Builder.getInt1Ty(), CharUnits::One(), "_call_try_exit"); // A slot containing the exception to rethrow. Only needed when we // have both a @catch and a @finally. @@ -4616,10 +4543,8 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // Push a normal cleanup to leave the try scope. CGF.EHStack.pushCleanup(NormalAndEHCleanup, &S, - SyncArgSlot, - CallTryExitVar, - ExceptionData, - &ObjCTypes); + SyncArgSlot, CallTryExitVar, + ExceptionData, &ObjCTypes); // Enter a try block: // - Call objc_exception_try_enter to push ExceptionData on top of @@ -4629,7 +4554,7 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // - Call setjmp on the exception data buffer. llvm::Constant *Zero = llvm::ConstantInt::get(CGF.Builder.getInt32Ty(), 0); - llvm::Value *GEPIndexes[] = { Zero, Zero, Zero }; + llvm::Value *GEPIndexes[] = {Zero, Zero, Zero}; llvm::Value *SetJmpBuffer = CGF.Builder.CreateGEP( ObjCTypes.ExceptionDataTy, ExceptionData.emitRawPointer(CGF), GEPIndexes, "setjmp_buffer"); @@ -4642,7 +4567,7 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, llvm::BasicBlock *TryBlock = CGF.createBasicBlock("try"); llvm::BasicBlock *TryHandler = CGF.createBasicBlock("try.handler"); llvm::Value *DidCatch = - CGF.Builder.CreateIsNotNull(SetJmpResult, "did_catch_exception"); + CGF.Builder.CreateIsNotNull(SetJmpResult, "did_catch_exception"); CGF.Builder.CreateCondBr(DidCatch, TryHandler, TryBlock); // Emit the protected block. @@ -4678,7 +4603,7 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // benefit of any @throws in the handlers. CGF.ObjCEHValueStack.push_back(Caught); - const ObjCAtTryStmt* AtTryStmt = cast(&S); + const ObjCAtTryStmt *AtTryStmt = cast(&S); bool HasFinally = (AtTryStmt->getFinallyStmt() != nullptr); @@ -4687,9 +4612,8 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, if (HasFinally) { // Save the currently-propagating exception before // objc_exception_try_enter clears the exception slot. 
- PropagatingExnVar = CGF.CreateTempAlloca(Caught->getType(), - CGF.getPointerAlign(), - "propagating_exception"); + PropagatingExnVar = CGF.CreateTempAlloca( + Caught->getType(), CGF.getPointerAlign(), "propagating_exception"); CGF.Builder.CreateStore(Caught, PropagatingExnVar); // Enter a new exception try block (in case a @catch block @@ -4697,13 +4621,12 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, CGF.EmitNounwindRuntimeCall(ObjCTypes.getExceptionTryEnterFn(), ExceptionData.emitRawPointer(CGF)); - llvm::CallInst *SetJmpResult = - CGF.EmitNounwindRuntimeCall(ObjCTypes.getSetJmpFn(), - SetJmpBuffer, "setjmp.result"); + llvm::CallInst *SetJmpResult = CGF.EmitNounwindRuntimeCall( + ObjCTypes.getSetJmpFn(), SetJmpBuffer, "setjmp.result"); SetJmpResult->setCanReturnTwice(); llvm::Value *Threw = - CGF.Builder.CreateIsNotNull(SetJmpResult, "did_catch_exception"); + CGF.Builder.CreateIsNotNull(SetJmpResult, "did_catch_exception"); CatchBlock = CGF.createBasicBlock("catch"); CatchHandler = CGF.createBasicBlock("catch_for_catch"); @@ -4767,10 +4690,9 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // Check if the @catch block matches the exception object. llvm::Value *Class = EmitClassRef(CGF, IDecl); - llvm::Value *matchArgs[] = { Class, Caught }; - llvm::CallInst *Match = - CGF.EmitNounwindRuntimeCall(ObjCTypes.getExceptionMatchFn(), - matchArgs, "match"); + llvm::Value *matchArgs[] = {Class, Caught}; + llvm::CallInst *Match = CGF.EmitNounwindRuntimeCall( + ObjCTypes.getExceptionMatchFn(), matchArgs, "match"); llvm::BasicBlock *MatchedBlock = CGF.createBasicBlock("match"); llvm::BasicBlock *NextCatchBlock = CGF.createBasicBlock("catch.next"); @@ -4789,9 +4711,8 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, assert(CGF.HaveInsertPoint() && "DeclStmt destroyed insert point?"); // Initialize the catch variable. - llvm::Value *Tmp = - CGF.Builder.CreateBitCast(Caught, - CGF.ConvertType(CatchParam->getType())); + llvm::Value *Tmp = CGF.Builder.CreateBitCast( + Caught, CGF.ConvertType(CatchParam->getType())); EmitInitOfCatchParam(CGF, Tmp, CatchParam); CGF.EmitStmt(CatchStmt->getCatchBody()); @@ -4821,7 +4742,7 @@ void CGObjCMac::EmitTryOrSynchronizedStmt(CodeGen::CodeGenFunction &CGF, // In theory we might now need a write hazard, but actually it's // unnecessary because there's no local-accessing code between // the try's write hazard and here. - //Hazards.emitWriteHazard(); + // Hazards.emitWriteHazard(); // Extract the new exception and save it to the // propagating-exception slot. @@ -4879,7 +4800,7 @@ void CGObjCMac::EmitThrowStmt(CodeGen::CodeGenFunction &CGF, if (const Expr *ThrowExpr = S.getThrowExpr()) { llvm::Value *Exception = CGF.EmitObjCThrowOperand(ThrowExpr); ExceptionAsObject = - CGF.Builder.CreateBitCast(Exception, ObjCTypes.ObjectPtrTy); + CGF.Builder.CreateBitCast(Exception, ObjCTypes.ObjectPtrTy); } else { assert((!CGF.ObjCEHValueStack.empty() && CGF.ObjCEHValueStack.back()) && "Unexpected rethrow outside @catch block."); @@ -4887,7 +4808,7 @@ void CGObjCMac::EmitThrowStmt(CodeGen::CodeGenFunction &CGF, } CGF.EmitRuntimeCall(ObjCTypes.getExceptionThrowFn(), ExceptionAsObject) - ->setDoesNotReturn(); + ->setDoesNotReturn(); CGF.Builder.CreateUnreachable(); // Clear the insertion point to indicate we are in unreachable code. 
@@ -4898,14 +4819,13 @@ void CGObjCMac::EmitThrowStmt(CodeGen::CodeGenFunction &CGF, /// EmitObjCWeakRead - Code gen for loading value of a __weak /// object: objc_read_weak (id *src) /// -llvm::Value * CGObjCMac::EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, - Address AddrWeakObj) { - llvm::Type* DestTy = AddrWeakObj.getElementType(); +llvm::Value *CGObjCMac::EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, + Address AddrWeakObj) { + llvm::Type *DestTy = AddrWeakObj.getElementType(); llvm::Value *AddrWeakObjVal = CGF.Builder.CreateBitCast( AddrWeakObj.emitRawPointer(CGF), ObjCTypes.PtrObjectPtrTy); - llvm::Value *read_weak = - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcReadWeakFn(), - AddrWeakObjVal, "weakread"); + llvm::Value *read_weak = CGF.EmitNounwindRuntimeCall( + ObjCTypes.getGcReadWeakFn(), AddrWeakObjVal, "weakread"); read_weak = CGF.Builder.CreateBitCast(read_weak, DestTy); return read_weak; } @@ -4915,7 +4835,7 @@ llvm::Value * CGObjCMac::EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, /// void CGObjCMac::EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dst) { - llvm::Type * SrcTy = src->getType(); + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); @@ -4926,9 +4846,9 @@ void CGObjCMac::EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, src = CGF.Builder.CreateBitCast(src, ObjCTypes.ObjectPtrTy); llvm::Value *dstVal = CGF.Builder.CreateBitCast(dst.emitRawPointer(CGF), ObjCTypes.PtrObjectPtrTy); - llvm::Value *args[] = { src, dstVal }; - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignWeakFn(), - args, "weakassign"); + llvm::Value *args[] = {src, dstVal}; + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignWeakFn(), args, + "weakassign"); } /// EmitObjCGlobalAssign - Code gen for assigning to a __strong object. @@ -4937,7 +4857,7 @@ void CGObjCMac::EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, void CGObjCMac::EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dst, bool threadlocal) { - llvm::Type * SrcTy = src->getType(); + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); @@ -4950,11 +4870,11 @@ void CGObjCMac::EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, ObjCTypes.PtrObjectPtrTy); llvm::Value *args[] = {src, dstVal}; if (!threadlocal) - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignGlobalFn(), - args, "globalassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignGlobalFn(), args, + "globalassign"); else - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignThreadLocalFn(), - args, "threadlocalassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignThreadLocalFn(), args, + "threadlocalassign"); } /// EmitObjCIvarAssign - Code gen for assigning to a __strong object. 
@@ -4964,7 +4884,7 @@ void CGObjCMac::EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dst, llvm::Value *ivarOffset) { assert(ivarOffset && "EmitObjCIvarAssign - ivarOffset is NULL"); - llvm::Type * SrcTy = src->getType(); + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); @@ -4984,7 +4904,7 @@ void CGObjCMac::EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, /// void CGObjCMac::EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dst) { - llvm::Type * SrcTy = src->getType(); + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); @@ -4996,8 +4916,8 @@ void CGObjCMac::EmitObjCStrongCastAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *dstVal = CGF.Builder.CreateBitCast(dst.emitRawPointer(CGF), ObjCTypes.PtrObjectPtrTy); llvm::Value *args[] = {src, dstVal}; - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignStrongCastFn(), - args, "strongassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignStrongCastFn(), args, + "strongassign"); } void CGObjCMac::EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF, @@ -5016,7 +4936,7 @@ LValue CGObjCMac::EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF, const ObjCIvarDecl *Ivar, unsigned CVRQualifiers) { const ObjCInterfaceDecl *ID = - ObjectTy->castAs()->getInterface(); + ObjectTy->castAs()->getInterface(); return EmitValueForIvarAtOffset(CGF, ID, BaseValue, Ivar, CVRQualifiers, EmitIvarOffset(CGF, ID, Ivar)); } @@ -5026,8 +4946,7 @@ llvm::Value *CGObjCMac::EmitIvarOffset(CodeGen::CodeGenFunction &CGF, const ObjCIvarDecl *Ivar) { uint64_t Offset = ComputeIvarBaseOffset(CGM, Interface, Ivar); return llvm::ConstantInt::get( - CGM.getTypes().ConvertType(CGM.getContext().LongTy), - Offset); + CGM.getTypes().ConvertType(CGM.getContext().LongTy), Offset); } /* *** Private Interface *** */ @@ -5060,6 +4979,7 @@ std::string CGObjCCommonMac::GetSectionName(StringRef Section, llvm_unreachable("Unhandled llvm::Triple::ObjectFormatType enum"); } +// clang-format off /// EmitImageInfo - Emit the image info marker used to encode some module /// level information. /// @@ -5081,6 +5001,7 @@ enum ImageInfoFlags { eImageInfo_ImageIsSimulated = (1 << 5), eImageInfo_ClassProperties = (1 << 6) }; +// clang-format on void CGObjCCommonMac::EmitImageInfo() { unsigned version = 0; // Version is unused? @@ -5103,15 +5024,13 @@ void CGObjCCommonMac::EmitImageInfo() { auto Int8Ty = llvm::Type::getInt8Ty(VMContext); if (CGM.getLangOpts().getGC() == LangOptions::NonGC) { // Non-GC overrides those files which specify GC. - Mod.addModuleFlag(llvm::Module::Error, - "Objective-C Garbage Collection", - llvm::ConstantInt::get(Int8Ty,0)); + Mod.addModuleFlag(llvm::Module::Error, "Objective-C Garbage Collection", + llvm::ConstantInt::get(Int8Ty, 0)); } else { // Add the ObjC garbage collection value. - Mod.addModuleFlag(llvm::Module::Error, - "Objective-C Garbage Collection", - llvm::ConstantInt::get(Int8Ty, - (uint8_t)eImageInfo_GarbageCollected)); + Mod.addModuleFlag( + llvm::Module::Error, "Objective-C Garbage Collection", + llvm::ConstantInt::get(Int8Ty, (uint8_t)eImageInfo_GarbageCollected)); if (CGM.getLangOpts().getGC() == LangOptions::GCOnly) { // Add the ObjC GC Only value. 
@@ -5121,8 +5040,8 @@ void CGObjCCommonMac::EmitImageInfo() { // Require that GC be specified and set to eImageInfo_GarbageCollected. llvm::Metadata *Ops[2] = { llvm::MDString::get(VMContext, "Objective-C Garbage Collection"), - llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( - Int8Ty, eImageInfo_GarbageCollected))}; + llvm::ConstantAsMetadata::get( + llvm::ConstantInt::get(Int8Ty, eImageInfo_GarbageCollected))}; Mod.addModuleFlag(llvm::Module::Require, "Objective-C GC Only", llvm::MDNode::get(VMContext, Ops)); } @@ -5181,7 +5100,7 @@ llvm::Constant *CGObjCMac::EmitModuleSymbols() { // The runtime expects exactly the list of defined classes followed // by the list of defined categories, in a single array. auto array = values.beginArray(ObjCTypes.Int8PtrTy); - for (unsigned i=0; igetImplementation()) @@ -5191,7 +5110,7 @@ llvm::Constant *CGObjCMac::EmitModuleSymbols() { array.add(DefinedClasses[i]); } - for (unsigned i=0; igetDecl(); // If this is a union, remember that we had one, because it might mess @@ -5284,11 +5203,12 @@ void IvarLayoutBuilder::visitRecord(const RecordType *RT, const ASTRecordLayout *recLayout = nullptr; visitAggregate(RD->field_begin(), RD->field_end(), offset, [&](const FieldDecl *field) -> CharUnits { - if (!recLayout) - recLayout = &CGM.getContext().getASTRecordLayout(RD); - auto offsetInBits = recLayout->getFieldOffset(field->getFieldIndex()); - return CGM.getContext().toCharUnitsFromBits(offsetInBits); - }); + if (!recLayout) + recLayout = &CGM.getContext().getASTRecordLayout(RD); + auto offsetInBits = + recLayout->getFieldOffset(field->getFieldIndex()); + return CGM.getContext().toCharUnitsFromBits(offsetInBits); + }); } template @@ -5331,7 +5251,8 @@ void IvarLayoutBuilder::visitField(const FieldDecl *field, // If we ended up with a zero-sized array, we've done what we can do within // the limits of this layout encoding. - if (numElts == 0) return; + if (numElts == 0) + return; // Recurse if the base element type is a record type. if (auto recType = fieldType->getAs()) { @@ -5361,10 +5282,10 @@ void IvarLayoutBuilder::visitField(const FieldDecl *field, Qualifiers::GC GCAttr = GetGCAttrTypeForType(CGM.getContext(), fieldType); // If it matches what we're looking for, add an entry. - if ((ForStrongLayout && GCAttr == Qualifiers::Strong) - || (!ForStrongLayout && GCAttr == Qualifiers::Weak)) { - assert(CGM.getContext().getTypeSizeInChars(fieldType) - == CGM.getPointerSize()); + if ((ForStrongLayout && GCAttr == Qualifiers::Strong) || + (!ForStrongLayout && GCAttr == Qualifiers::Weak)) { + assert(CGM.getContext().getTypeSizeInChars(fieldType) == + CGM.getPointerSize()); IvarsInfo.push_back(IvarInfo(fieldOffset, numElts)); } } @@ -5372,8 +5293,9 @@ void IvarLayoutBuilder::visitField(const FieldDecl *field, /// buildBitmap - This routine does the horsework of taking the offsets of /// strong/weak references and creating a bitmap. The bitmap is also /// returned in the given buffer, suitable for being passed to \c dump(). -llvm::Constant *IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC, - llvm::SmallVectorImpl &buffer) { +llvm::Constant * +IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC, + llvm::SmallVectorImpl &buffer) { // The bitmap is a series of skip/scan instructions, aligned to word // boundaries. The skip is performed first. const unsigned char MaxNibble = 0xF; @@ -5454,7 +5376,8 @@ llvm::Constant *IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC, // Ignore scan requests that don't start at an even multiple of the // word size. 
We can't encode them. - if ((beginOfScan % WordSize) != 0) continue; + if ((beginOfScan % WordSize) != 0) + continue; // Ignore scan requests that start before the instance start. // This assumes that scans never span that boundary. The boundary @@ -5479,7 +5402,8 @@ llvm::Constant *IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC, beginOfScanInWords = endOfLastScanInWords; // If that leaves us with nothing to scan, ignore this request. - if (beginOfScanInWords >= endOfScanInWords) continue; + if (beginOfScanInWords >= endOfScanInWords) + continue; } // Scan to the end of the request. @@ -5496,7 +5420,7 @@ llvm::Constant *IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC, // or necessary for the ARC-style layout strings. if (CGM.getLangOpts().getGC() != LangOptions::NonGC) { unsigned lastOffsetInWords = - (InstanceEnd - InstanceBegin + WordSize - CharUnits::One()) / WordSize; + (InstanceEnd - InstanceBegin + WordSize - CharUnits::One()) / WordSize; if (lastOffsetInWords > endOfLastScanInWords) { skip(lastOffsetInWords - endOfLastScanInWords); } @@ -5539,7 +5463,7 @@ CGObjCCommonMac::BuildIvarLayout(const ObjCImplementationDecl *OMD, return llvm::Constant::getNullValue(PtrTy); const ObjCInterfaceDecl *OI = OMD->getClassInterface(); - SmallVector ivars; + SmallVector ivars; // GC layout strings include the complete object layout, possibly // inaccurately in the non-fragile ABI; the runtime knows how to fix this @@ -5553,22 +5477,21 @@ CGObjCCommonMac::BuildIvarLayout(const ObjCImplementationDecl *OMD, // MRC weak layout strings follow the ARC style. CharUnits baseOffset; if (CGM.getLangOpts().getGC() == LangOptions::NonGC) { - for (const ObjCIvarDecl *IVD = OI->all_declared_ivar_begin(); - IVD; IVD = IVD->getNextIvar()) + for (const ObjCIvarDecl *IVD = OI->all_declared_ivar_begin(); IVD; + IVD = IVD->getNextIvar()) ivars.push_back(IVD); if (isNonFragileABI()) { baseOffset = beginOffset; // InstanceStart } else if (!ivars.empty()) { baseOffset = - CharUnits::fromQuantity(ComputeIvarBaseOffset(CGM, OMD, ivars[0])); + CharUnits::fromQuantity(ComputeIvarBaseOffset(CGM, OMD, ivars[0])); } else { baseOffset = CharUnits::Zero(); } baseOffset = baseOffset.alignTo(CGM.getPointerAlign()); - } - else { + } else { CGM.getContext().DeepCollectObjCIvars(OI, true, ivars); baseOffset = CharUnits::Zero(); @@ -5581,8 +5504,9 @@ CGObjCCommonMac::BuildIvarLayout(const ObjCImplementationDecl *OMD, builder.visitAggregate(ivars.begin(), ivars.end(), CharUnits::Zero(), [&](const ObjCIvarDecl *ivar) -> CharUnits { - return CharUnits::fromQuantity(ComputeIvarBaseOffset(CGM, OMD, ivar)); - }); + return CharUnits::fromQuantity( + ComputeIvarBaseOffset(CGM, OMD, ivar)); + }); if (!builder.hasBitmapData()) return llvm::Constant::getNullValue(PtrTy); @@ -5590,7 +5514,7 @@ CGObjCCommonMac::BuildIvarLayout(const ObjCImplementationDecl *OMD, llvm::SmallVector buffer; llvm::Constant *C = builder.buildBitmap(*this, buffer); - if (CGM.getLangOpts().ObjCGCBitmapPrint && !buffer.empty()) { + if (CGM.getLangOpts().ObjCGCBitmapPrint && !buffer.empty()) { printf("\n%s ivar layout for class '%s': ", ForStrongLayout ? 
"strong" : "weak", OMD->getClassInterface()->getName().str().c_str()); @@ -5603,7 +5527,8 @@ llvm::Constant *CGObjCCommonMac::GetMethodVarName(Selector Sel) { llvm::GlobalVariable *&Entry = MethodVarNames[Sel]; // FIXME: Avoid std::string in "Sel.getAsString()" if (!Entry) - Entry = CreateCStringLiteral(Sel.getAsString(), ObjCLabelType::MethodVarName); + Entry = + CreateCStringLiteral(Sel.getAsString(), ObjCLabelType::MethodVarName); return getConstantGEP(VMContext, Entry, 0, 0); } @@ -5625,7 +5550,7 @@ llvm::Constant *CGObjCCommonMac::GetMethodVarType(const FieldDecl *Field) { llvm::Constant *CGObjCCommonMac::GetMethodVarType(const ObjCMethodDecl *D, bool Extended) { std::string TypeStr = - CGM.getContext().getObjCEncodingForMethodDecl(D, Extended); + CGM.getContext().getObjCEncodingForMethodDecl(D, Extended); llvm::GlobalVariable *&Entry = MethodVarTypes[TypeStr]; if (!Entry) @@ -5647,7 +5572,7 @@ llvm::Constant * CGObjCCommonMac::GetPropertyTypeString(const ObjCPropertyDecl *PD, const Decl *Container) { std::string TypeStr = - CGM.getContext().getObjCEncodingForPropertyDecl(PD, Container); + CGM.getContext().getObjCEncodingForPropertyDecl(PD, Container); return GetPropertyName(&CGM.getContext().Idents.get(TypeStr)); } @@ -5707,8 +5632,8 @@ CGObjCNonFragileABIMac::CGObjCNonFragileABIMac(CodeGen::CodeGenModule &cgm) /* *** */ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) - : VMContext(cgm.getLLVMContext()), CGM(cgm), ExternalProtocolPtrTy(nullptr) -{ + : VMContext(cgm.getLLVMContext()), CGM(cgm), + ExternalProtocolPtrTy(nullptr) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); unsigned ProgramAS = CGM.getDataLayout().getProgramAddressSpace(); @@ -5727,12 +5652,10 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) else IvarOffsetVarTy = LongTy; - ObjectPtrTy = - cast(Types.ConvertType(Ctx.getObjCIdType())); - PtrObjectPtrTy = - llvm::PointerType::getUnqual(ObjectPtrTy); + ObjectPtrTy = cast(Types.ConvertType(Ctx.getObjCIdType())); + PtrObjectPtrTy = llvm::PointerType::getUnqual(ObjectPtrTy); SelectorPtrTy = - cast(Types.ConvertType(Ctx.getObjCSelType())); + cast(Types.ConvertType(Ctx.getObjCSelType())); // I'm not sure I like this. The implicit coordination is a bit // gross. 
We should solve this in a reasonable fashion because this @@ -5793,7 +5716,7 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) } ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) - : ObjCCommonTypesHelper(cgm) { + : ObjCCommonTypesHelper(cgm) { // struct _objc_method_description { // SEL name; // char *types; @@ -5811,7 +5734,7 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) // struct _objc_method_description_list * MethodDescriptionListPtrTy = - llvm::PointerType::getUnqual(MethodDescriptionListTy); + llvm::PointerType::getUnqual(MethodDescriptionListTy); // Protocol description structures @@ -5867,13 +5790,12 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) IntTy); // struct _objc_ivar_list * - IvarListTy = - llvm::StructType::create(VMContext, "struct._objc_ivar_list"); + IvarListTy = llvm::StructType::create(VMContext, "struct._objc_ivar_list"); IvarListPtrTy = llvm::PointerType::getUnqual(IvarListTy); // struct _objc_method_list * MethodListTy = - llvm::StructType::create(VMContext, "struct._objc_method_list"); + llvm::StructType::create(VMContext, "struct._objc_method_list"); MethodListPtrTy = llvm::PointerType::getUnqual(MethodListTy); // struct _objc_class_extension * @@ -5954,8 +5876,9 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) llvm::ArrayType::get(CGM.Int32Ty, SetJmpBufferSize), StackPtrTy); } -ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModule &cgm) - : ObjCCommonTypesHelper(cgm) { +ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( + CodeGen::CodeGenModule &cgm) + : ObjCCommonTypesHelper(cgm) { // struct _method_list_t { // uint32_t entsize; // sizeof(struct _objc_method) // uint32_t method_count; @@ -6122,7 +6045,6 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModul // SuperMessageRefPtrTy - LLVM for struct _super_message_ref_t* SuperMessageRefPtrTy = llvm::PointerType::getUnqual(SuperMessageRefTy); - // struct objc_typeinfo { // const void** vtable; // objc_ehtype_vtable + 2 // const char* name; // c++ typeinfo string @@ -6148,14 +6070,12 @@ void CGObjCNonFragileABIMac::AddModuleClassList( if (!NumClasses) return; - SmallVector Symbols(NumClasses); - for (unsigned i=0; i Symbols(NumClasses); + for (unsigned i = 0; i < NumClasses; i++) Symbols[i] = Container[i]; - llvm::Constant *Init = - llvm::ConstantArray::get(llvm::ArrayType::get(ObjCTypes.Int8PtrTy, - Symbols.size()), - Symbols); + llvm::Constant *Init = llvm::ConstantArray::get( + llvm::ArrayType::get(ObjCTypes.Int8PtrTy, Symbols.size()), Symbols); // Section name is obtained by calling GetSectionName, which returns // sections in the __DATA segment on MachO. @@ -6176,36 +6096,37 @@ void CGObjCNonFragileABIMac::FinishNonFragileABIModule() { // Build list of all implemented class addresses in array // L_OBJC_LABEL_CLASS_$. - for (unsigned i=0, NumClasses=ImplementedClasses.size(); igetImplementation()) // We are implementing a weak imported interface. 
Give it external linkage if (ID->isWeakImported() && !IMP->isWeakImported()) { DefinedClasses[i]->setLinkage(llvm::GlobalVariable::ExternalLinkage); - DefinedMetaClasses[i]->setLinkage(llvm::GlobalVariable::ExternalLinkage); + DefinedMetaClasses[i]->setLinkage( + llvm::GlobalVariable::ExternalLinkage); } } - AddModuleClassList(DefinedClasses, "OBJC_LABEL_CLASS_$", - GetSectionName("__objc_classlist", - "regular,no_dead_strip")); + AddModuleClassList( + DefinedClasses, "OBJC_LABEL_CLASS_$", + GetSectionName("__objc_classlist", "regular,no_dead_strip")); - AddModuleClassList(DefinedNonLazyClasses, "OBJC_LABEL_NONLAZY_CLASS_$", - GetSectionName("__objc_nlclslist", - "regular,no_dead_strip")); + AddModuleClassList( + DefinedNonLazyClasses, "OBJC_LABEL_NONLAZY_CLASS_$", + GetSectionName("__objc_nlclslist", "regular,no_dead_strip")); // Build list of all implemented category addresses in array // L_OBJC_LABEL_CATEGORY_$. AddModuleClassList(DefinedCategories, "OBJC_LABEL_CATEGORY_$", - GetSectionName("__objc_catlist", - "regular,no_dead_strip")); - AddModuleClassList(DefinedStubCategories, "OBJC_LABEL_STUB_CATEGORY_$", - GetSectionName("__objc_catlist2", - "regular,no_dead_strip")); - AddModuleClassList(DefinedNonLazyCategories, "OBJC_LABEL_NONLAZY_CATEGORY_$", - GetSectionName("__objc_nlcatlist", - "regular,no_dead_strip")); + GetSectionName("__objc_catlist", "regular,no_dead_strip")); + AddModuleClassList( + DefinedStubCategories, "OBJC_LABEL_STUB_CATEGORY_$", + GetSectionName("__objc_catlist2", "regular,no_dead_strip")); + AddModuleClassList( + DefinedNonLazyCategories, "OBJC_LABEL_NONLAZY_CATEGORY_$", + GetSectionName("__objc_nlcatlist", "regular,no_dead_strip")); EmitImageInfo(); } @@ -6264,7 +6185,7 @@ bool CGObjCNonFragileABIMac::isVTableDispatchedSelector(Selector Sel) { &CGM.getContext().Idents.get("objects"), &CGM.getContext().Idents.get("count")}; VTableDispatchMethods.insert( - CGM.getContext().Selectors.getSelector(3, KeyIdents)); + CGM.getContext().Selectors.getSelector(3, KeyIdents)); } } @@ -6286,11 +6207,9 @@ bool CGObjCNonFragileABIMac::isVTableDispatchedSelector(Selector Sel) { /// const struct _prop_list_t * const properties; /// } /// -llvm::GlobalVariable * CGObjCNonFragileABIMac::BuildClassRoTInitializer( - unsigned flags, - unsigned InstanceStart, - unsigned InstanceSize, - const ObjCImplementationDecl *ID) { +llvm::GlobalVariable *CGObjCNonFragileABIMac::BuildClassRoTInitializer( + unsigned flags, unsigned InstanceStart, unsigned InstanceSize, + const ObjCImplementationDecl *ID) { std::string ClassName = std::string(ID->getObjCRuntimeNameAsString()); CharUnits beginInstance = CharUnits::fromQuantity(InstanceStart); @@ -6309,12 +6228,12 @@ llvm::GlobalVariable * CGObjCNonFragileABIMac::BuildClassRoTInitializer( values.addInt(ObjCTypes.IntTy, InstanceStart); values.addInt(ObjCTypes.IntTy, InstanceSize); values.add((flags & NonFragileABI_Class_Meta) - ? GetIvarLayoutName(nullptr, ObjCTypes) - : BuildStrongIvarLayout(ID, beginInstance, endInstance)); + ? 
GetIvarLayoutName(nullptr, ObjCTypes) + : BuildStrongIvarLayout(ID, beginInstance, endInstance)); values.add(GetClassName(ID->getObjCRuntimeNameAsString())); // const struct _method_list_t * const baseMethods; - SmallVector methods; + SmallVector methods; if (flags & NonFragileABI_Class_Meta) { for (const auto *MD : ID->class_methods()) if (!MD->isDirectMethod()) @@ -6327,29 +6246,29 @@ llvm::GlobalVariable * CGObjCNonFragileABIMac::BuildClassRoTInitializer( values.add(emitMethodList(ID->getObjCRuntimeNameAsString(), (flags & NonFragileABI_Class_Meta) - ? MethodListType::ClassMethods - : MethodListType::InstanceMethods, + ? MethodListType::ClassMethods + : MethodListType::InstanceMethods, methods)); const ObjCInterfaceDecl *OID = ID->getClassInterface(); assert(OID && "CGObjCNonFragileABIMac::BuildClassRoTInitializer"); - values.add(EmitProtocolList("_OBJC_CLASS_PROTOCOLS_$_" - + OID->getObjCRuntimeNameAsString(), + values.add(EmitProtocolList("_OBJC_CLASS_PROTOCOLS_$_" + + OID->getObjCRuntimeNameAsString(), OID->all_referenced_protocol_begin(), OID->all_referenced_protocol_end())); if (flags & NonFragileABI_Class_Meta) { values.addNullPointer(ObjCTypes.IvarListnfABIPtrTy); values.add(GetIvarLayoutName(nullptr, ObjCTypes)); - values.add(EmitPropertyList( - "_OBJC_$_CLASS_PROP_LIST_" + ID->getObjCRuntimeNameAsString(), - ID, ID->getClassInterface(), ObjCTypes, true)); + values.add(EmitPropertyList("_OBJC_$_CLASS_PROP_LIST_" + + ID->getObjCRuntimeNameAsString(), + ID, ID->getClassInterface(), ObjCTypes, true)); } else { values.add(EmitIvarList(ID)); values.add(BuildWeakIvarLayout(ID, beginInstance, endInstance, hasMRCWeak)); - values.add(EmitPropertyList( - "_OBJC_$_PROP_LIST_" + ID->getObjCRuntimeNameAsString(), - ID, ID->getClassInterface(), ObjCTypes, false)); + values.add(EmitPropertyList("_OBJC_$_PROP_LIST_" + + ID->getObjCRuntimeNameAsString(), + ID, ID->getClassInterface(), ObjCTypes, false)); } llvm::SmallString<64> roLabel; @@ -6371,13 +6290,10 @@ llvm::GlobalVariable * CGObjCNonFragileABIMac::BuildClassRoTInitializer( /// struct class_ro_t *ro; /// } /// -llvm::GlobalVariable * -CGObjCNonFragileABIMac::BuildClassObject(const ObjCInterfaceDecl *CI, - bool isMetaclass, - llvm::Constant *IsAGV, - llvm::Constant *SuperClassGV, - llvm::Constant *ClassRoGV, - bool HiddenVisibility) { +llvm::GlobalVariable *CGObjCNonFragileABIMac::BuildClassObject( + const ObjCInterfaceDecl *CI, bool isMetaclass, llvm::Constant *IsAGV, + llvm::Constant *SuperClassGV, llvm::Constant *ClassRoGV, + bool HiddenVisibility) { ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(ObjCTypes.ClassnfABITy); values.add(IsAGV); @@ -6390,8 +6306,8 @@ CGObjCNonFragileABIMac::BuildClassObject(const ObjCInterfaceDecl *CI, values.add(ObjCEmptyVtableVar); values.add(ClassRoGV); - llvm::GlobalVariable *GV = - cast(GetClassGlobal(CI, isMetaclass, ForDefinition)); + llvm::GlobalVariable *GV = cast( + GetClassGlobal(CI, isMetaclass, ForDefinition)); values.finishAndSetAsInitializer(GV); if (CGM.getTriple().isOSBinFormatMachO()) @@ -6414,7 +6330,7 @@ void CGObjCNonFragileABIMac::GetClassSizeInfo(const ObjCImplementationDecl *OID, uint32_t &InstanceStart, uint32_t &InstanceSize) { const ASTRecordLayout &RL = - CGM.getContext().getASTObjCImplementationLayout(OID); + CGM.getContext().getASTObjCImplementationLayout(OID); // InstanceSize is really instance end. 
InstanceSize = RL.getDataSize().getQuantity(); @@ -6448,27 +6364,26 @@ static llvm::GlobalValue::DLLStorageClassTypes getStorage(CodeGenModule &CGM, void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) { if (!ObjCEmptyCacheVar) { - ObjCEmptyCacheVar = - new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.CacheTy, false, - llvm::GlobalValue::ExternalLinkage, nullptr, - "_objc_empty_cache"); + ObjCEmptyCacheVar = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.CacheTy, false, + llvm::GlobalValue::ExternalLinkage, nullptr, "_objc_empty_cache"); if (CGM.getTriple().isOSBinFormatCOFF()) - ObjCEmptyCacheVar->setDLLStorageClass(getStorage(CGM, "_objc_empty_cache")); + ObjCEmptyCacheVar->setDLLStorageClass( + getStorage(CGM, "_objc_empty_cache")); // Only OS X with deployment version <10.9 use the empty vtable symbol const llvm::Triple &Triple = CGM.getTarget().getTriple(); if (Triple.isMacOSX() && Triple.isMacOSXVersionLT(10, 9)) - ObjCEmptyVtableVar = - new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ImpnfABITy, false, - llvm::GlobalValue::ExternalLinkage, nullptr, - "_objc_empty_vtable"); + ObjCEmptyVtableVar = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.ImpnfABITy, false, + llvm::GlobalValue::ExternalLinkage, nullptr, "_objc_empty_vtable"); else ObjCEmptyVtableVar = llvm::ConstantPointerNull::get(CGM.UnqualPtrTy); } // FIXME: Is this correct (that meta class size is never computed)? uint32_t InstanceStart = - CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ClassnfABITy); + CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ClassnfABITy); uint32_t InstanceSize = InstanceStart; uint32_t flags = NonFragileABI_Class_Meta; @@ -6512,9 +6427,8 @@ void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) { llvm::GlobalVariable *CLASS_RO_GV = BuildClassRoTInitializer(flags, InstanceStart, InstanceSize, ID); - llvm::GlobalVariable *MetaTClass = - BuildClassObject(CI, /*metaclass*/ true, - IsAGV, SuperClassGV, CLASS_RO_GV, classIsHidden); + llvm::GlobalVariable *MetaTClass = BuildClassObject( + CI, /*metaclass*/ true, IsAGV, SuperClassGV, CLASS_RO_GV, classIsHidden); CGM.setGVProperties(MetaTClass, CI); DefinedMetaClasses.push_back(MetaTClass); @@ -6553,8 +6467,8 @@ void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) { BuildClassRoTInitializer(flags, InstanceStart, InstanceSize, ID); llvm::GlobalVariable *ClassMD = - BuildClassObject(CI, /*metaclass*/ false, - MetaTClass, SuperClassGV, CLASS_RO_GV, classIsHidden); + BuildClassObject(CI, /*metaclass*/ false, MetaTClass, SuperClassGV, + CLASS_RO_GV, classIsHidden); CGM.setGVProperties(ClassMD, CI); DefinedClasses.push_back(ClassMD); ImplementedClasses.push_back(CI); @@ -6565,7 +6479,7 @@ void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) { // Force the definition of the EHType if necessary. if (flags & NonFragileABI_Class_Exception) - (void) GetInterfaceEHType(CI, ForDefinition); + (void)GetInterfaceEHType(CI, ForDefinition); // Make sure method definition entries are all clear for next implementation. MethodDefinitions.clear(); } @@ -6578,8 +6492,9 @@ void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) { /// It generates a weak reference to l_OBJC_PROTOCOL_REFERENCE_$_Proto1 /// which will hold address of the protocol meta-data. 
/// -llvm::Value *CGObjCNonFragileABIMac::GenerateProtocolRef(CodeGenFunction &CGF, - const ObjCProtocolDecl *PD) { +llvm::Value * +CGObjCNonFragileABIMac::GenerateProtocolRef(CodeGenFunction &CGF, + const ObjCProtocolDecl *PD) { // This routine is called for @protocol only. So, we must build definition // of protocol's meta-data (not a reference to it!) @@ -6598,8 +6513,8 @@ llvm::Value *CGObjCNonFragileABIMac::GenerateProtocolRef(CodeGenFunction &CGF, PTGV = new llvm::GlobalVariable(CGM.getModule(), Init->getType(), false, llvm::GlobalValue::WeakAnyLinkage, Init, ProtocolName); - PTGV->setSection(GetSectionName("__objc_protorefs", - "coalesced,no_dead_strip")); + PTGV->setSection( + GetSectionName("__objc_protorefs", "coalesced,no_dead_strip")); PTGV->setVisibility(llvm::GlobalValue::HiddenVisibility); PTGV->setAlignment(Align.getAsAlign()); if (!CGM.getTriple().isOSBinFormatMachO()) @@ -6749,9 +6664,8 @@ void CGObjCNonFragileABIMac::emitMethodConstant(ConstantArrayBuilder &builder, /// struct _objc_method method_list[method_count]; /// } /// -llvm::Constant * -CGObjCNonFragileABIMac::emitMethodList(Twine name, MethodListType kind, - ArrayRef methods) { +llvm::Constant *CGObjCNonFragileABIMac::emitMethodList( + Twine name, MethodListType kind, ArrayRef methods) { // Return null for empty list. if (methods.empty()) return llvm::Constant::getNullValue(ObjCTypes.MethodListnfABIPtrTy); @@ -6824,10 +6738,9 @@ CGObjCNonFragileABIMac::ObjCIvarOffsetVariable(const ObjCInterfaceDecl *ID, Name += Ivar->getName(); llvm::GlobalVariable *IvarOffsetGV = CGM.getModule().getGlobalVariable(Name); if (!IvarOffsetGV) { - IvarOffsetGV = - new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.IvarOffsetVarTy, - false, llvm::GlobalValue::ExternalLinkage, - nullptr, Name.str()); + IvarOffsetGV = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.IvarOffsetVarTy, false, + llvm::GlobalValue::ExternalLinkage, nullptr, Name.str()); if (CGM.getTriple().isOSBinFormatCOFF()) { bool IsPrivateOrPackage = Ivar->getAccessControl() == ObjCIvarDecl::Private || @@ -6836,11 +6749,11 @@ CGObjCNonFragileABIMac::ObjCIvarOffsetVariable(const ObjCInterfaceDecl *ID, const ObjCInterfaceDecl *ContainingID = Ivar->getContainingInterface(); if (ContainingID->hasAttr()) - IvarOffsetGV - ->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass); + IvarOffsetGV->setDLLStorageClass( + llvm::GlobalValue::DLLImportStorageClass); else if (ContainingID->hasAttr() && !IsPrivateOrPackage) - IvarOffsetGV - ->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass); + IvarOffsetGV->setDLLStorageClass( + llvm::GlobalValue::DLLExportStorageClass); } } return IvarOffsetGV; @@ -6895,8 +6808,8 @@ CGObjCNonFragileABIMac::EmitIvarOffsetVar(const ObjCInterfaceDecl *ID, /// } /// -llvm::Constant *CGObjCNonFragileABIMac::EmitIvarList( - const ObjCImplementationDecl *ID) { +llvm::Constant * +CGObjCNonFragileABIMac::EmitIvarList(const ObjCImplementationDecl *ID) { ConstantInitBuilder builder(CGM); auto ivarList = builder.beginStruct(); @@ -6910,8 +6823,8 @@ llvm::Constant *CGObjCNonFragileABIMac::EmitIvarList( // FIXME. Consolidate this with similar code in GenerateClass. - for (const ObjCIvarDecl *IVD = OID->all_declared_ivar_begin(); - IVD; IVD = IVD->getNextIvar()) { + for (const ObjCIvarDecl *IVD = OID->all_declared_ivar_begin(); IVD; + IVD = IVD->getNextIvar()) { // Ignore unnamed bit-fields. 
if (!IVD->getDeclName()) continue; @@ -6921,11 +6834,11 @@ llvm::Constant *CGObjCNonFragileABIMac::EmitIvarList( ComputeIvarBaseOffset(CGM, ID, IVD))); ivar.add(GetMethodVarName(IVD->getIdentifier())); ivar.add(GetMethodVarType(IVD)); - llvm::Type *FieldTy = - CGM.getTypes().ConvertTypeForMem(IVD->getType()); + llvm::Type *FieldTy = CGM.getTypes().ConvertTypeForMem(IVD->getType()); unsigned Size = CGM.getDataLayout().getTypeAllocSize(FieldTy); - unsigned Align = CGM.getContext().getPreferredTypeAlign( - IVD->getType().getTypePtr()) >> 3; + unsigned Align = + CGM.getContext().getPreferredTypeAlign(IVD->getType().getTypePtr()) >> + 3; Align = llvm::Log2_32(Align); ivar.addInt(ObjCTypes.IntTy, Align); // NOTE. Size of a bitfield does not match gcc's, because of the @@ -6954,8 +6867,8 @@ llvm::Constant *CGObjCNonFragileABIMac::EmitIvarList( return GV; } -llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocolRef( - const ObjCProtocolDecl *PD) { +llvm::Constant * +CGObjCNonFragileABIMac::GetOrEmitProtocolRef(const ObjCProtocolDecl *PD) { llvm::GlobalVariable *&Entry = Protocols[PD->getIdentifier()]; assert(!PD->isNonRuntimeProtocol() && @@ -6965,8 +6878,8 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocolRef( // reference or not. At module finalization we add the empty // contents for protocols which were referenced but never defined. llvm::SmallString<64> Protocol; - llvm::raw_svector_ostream(Protocol) << "_OBJC_PROTOCOL_$_" - << PD->getObjCRuntimeNameAsString(); + llvm::raw_svector_ostream(Protocol) + << "_OBJC_PROTOCOL_$_" << PD->getObjCRuntimeNameAsString(); Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ProtocolnfABITy, false, llvm::GlobalValue::ExternalLinkage, @@ -6998,8 +6911,8 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocolRef( /// @endcode /// -llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol( - const ObjCProtocolDecl *PD) { +llvm::Constant * +CGObjCNonFragileABIMac::GetOrEmitProtocol(const ObjCProtocolDecl *PD) { llvm::GlobalVariable *Entry = Protocols[PD->getIdentifier()]; // Early exit if a defining object has already been generated. 
@@ -7019,36 +6932,34 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol( // isa is NULL values.addNullPointer(ObjCTypes.ObjectPtrTy); values.add(GetClassName(PD->getObjCRuntimeNameAsString())); - values.add(EmitProtocolList("_OBJC_$_PROTOCOL_REFS_" - + PD->getObjCRuntimeNameAsString(), - PD->protocol_begin(), - PD->protocol_end())); - values.add(methodLists.emitMethodList(this, PD, - ProtocolMethodLists::RequiredInstanceMethods)); - values.add(methodLists.emitMethodList(this, PD, - ProtocolMethodLists::RequiredClassMethods)); - values.add(methodLists.emitMethodList(this, PD, - ProtocolMethodLists::OptionalInstanceMethods)); - values.add(methodLists.emitMethodList(this, PD, - ProtocolMethodLists::OptionalClassMethods)); - values.add(EmitPropertyList( - "_OBJC_$_PROP_LIST_" + PD->getObjCRuntimeNameAsString(), - nullptr, PD, ObjCTypes, false)); + values.add(EmitProtocolList("_OBJC_$_PROTOCOL_REFS_" + + PD->getObjCRuntimeNameAsString(), + PD->protocol_begin(), PD->protocol_end())); + values.add(methodLists.emitMethodList( + this, PD, ProtocolMethodLists::RequiredInstanceMethods)); + values.add(methodLists.emitMethodList( + this, PD, ProtocolMethodLists::RequiredClassMethods)); + values.add(methodLists.emitMethodList( + this, PD, ProtocolMethodLists::OptionalInstanceMethods)); + values.add(methodLists.emitMethodList( + this, PD, ProtocolMethodLists::OptionalClassMethods)); + values.add( + EmitPropertyList("_OBJC_$_PROP_LIST_" + PD->getObjCRuntimeNameAsString(), + nullptr, PD, ObjCTypes, false)); uint32_t Size = - CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ProtocolnfABITy); + CGM.getDataLayout().getTypeAllocSize(ObjCTypes.ProtocolnfABITy); values.addInt(ObjCTypes.IntTy, Size); values.addInt(ObjCTypes.IntTy, 0); - values.add(EmitProtocolMethodTypes("_OBJC_$_PROTOCOL_METHOD_TYPES_" - + PD->getObjCRuntimeNameAsString(), - methodLists.emitExtendedTypesArray(this), - ObjCTypes)); + values.add(EmitProtocolMethodTypes( + "_OBJC_$_PROTOCOL_METHOD_TYPES_" + PD->getObjCRuntimeNameAsString(), + methodLists.emitExtendedTypesArray(this), ObjCTypes)); // const char *demangledName; values.addNullPointer(ObjCTypes.Int8PtrTy); - values.add(EmitPropertyList( - "_OBJC_$_CLASS_PROP_LIST_" + PD->getObjCRuntimeNameAsString(), - nullptr, PD, ObjCTypes, true)); + values.add(EmitPropertyList("_OBJC_$_CLASS_PROP_LIST_" + + PD->getObjCRuntimeNameAsString(), + nullptr, PD, ObjCTypes, true)); if (Entry) { // Already created, fix the linkage and update the initializer. 
@@ -7057,7 +6968,7 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol( } else { llvm::SmallString<64> symbolName; llvm::raw_svector_ostream(symbolName) - << "_OBJC_PROTOCOL_$_" << PD->getObjCRuntimeNameAsString(); + << "_OBJC_PROTOCOL_$_" << PD->getObjCRuntimeNameAsString(); Entry = values.finishAndCreateGlobal(symbolName, CGM.getPointerAlign(), /*constant*/ false, @@ -7073,19 +6984,18 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol( // Use this protocol meta-data to build protocol list table in section // __DATA, __objc_protolist llvm::SmallString<64> ProtocolRef; - llvm::raw_svector_ostream(ProtocolRef) << "_OBJC_LABEL_PROTOCOL_$_" - << PD->getObjCRuntimeNameAsString(); + llvm::raw_svector_ostream(ProtocolRef) + << "_OBJC_LABEL_PROTOCOL_$_" << PD->getObjCRuntimeNameAsString(); - llvm::GlobalVariable *PTGV = - new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.ProtocolnfABIPtrTy, - false, llvm::GlobalValue::WeakAnyLinkage, Entry, - ProtocolRef); + llvm::GlobalVariable *PTGV = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.ProtocolnfABIPtrTy, false, + llvm::GlobalValue::WeakAnyLinkage, Entry, ProtocolRef); if (!CGM.getTriple().isOSBinFormatMachO()) PTGV->setComdat(CGM.getModule().getOrInsertComdat(ProtocolRef)); PTGV->setAlignment( CGM.getDataLayout().getABITypeAlign(ObjCTypes.ProtocolnfABIPtrTy)); - PTGV->setSection(GetSectionName("__objc_protolist", - "coalesced,no_dead_strip")); + PTGV->setSection( + GetSectionName("__objc_protolist", "coalesced,no_dead_strip")); PTGV->setVisibility(llvm::GlobalValue::HiddenVisibility); CGM.addUsedGlobal(PTGV); return Entry; @@ -7099,10 +7009,9 @@ llvm::Constant *CGObjCNonFragileABIMac::GetOrEmitProtocol( /// } /// @endcode /// -llvm::Constant * -CGObjCNonFragileABIMac::EmitProtocolList(Twine Name, - ObjCProtocolDecl::protocol_iterator begin, - ObjCProtocolDecl::protocol_iterator end) { +llvm::Constant *CGObjCNonFragileABIMac::EmitProtocolList( + Twine Name, ObjCProtocolDecl::protocol_iterator begin, + ObjCProtocolDecl::protocol_iterator end) { // Just return null for empty protocol lists auto Protocols = GetRuntimeProtocolList(begin, end); if (Protocols.empty()) @@ -7123,7 +7032,7 @@ CGObjCNonFragileABIMac::EmitProtocolList(Twine Name, SmallString<256> TmpName; Name.toVector(TmpName); llvm::GlobalVariable *GV = - CGM.getModule().getGlobalVariable(TmpName.str(), true); + CGM.getModule().getGlobalVariable(TmpName.str(), true); if (GV) return GV; @@ -7153,11 +7062,8 @@ CGObjCNonFragileABIMac::EmitProtocolList(Twine Name, /// @encode /// LValue CGObjCNonFragileABIMac::EmitObjCValueForIvar( - CodeGen::CodeGenFunction &CGF, - QualType ObjectTy, - llvm::Value *BaseValue, - const ObjCIvarDecl *Ivar, - unsigned CVRQualifiers) { + CodeGen::CodeGenFunction &CGF, QualType ObjectTy, llvm::Value *BaseValue, + const ObjCIvarDecl *Ivar, unsigned CVRQualifiers) { ObjCInterfaceDecl *ID = ObjectTy->castAs()->getInterface(); llvm::Value *Offset = EmitIvarOffset(CGF, ID, Ivar); return EmitValueForIvarAtOffset(CGF, ID, BaseValue, Ivar, CVRQualifiers, @@ -7175,9 +7081,8 @@ CGObjCNonFragileABIMac::EmitIvarOffset(CodeGen::CodeGenFunction &CGF, ComputeIvarBaseOffset(CGM, Interface->getImplementation(), Ivar)); } else { llvm::GlobalVariable *GV = ObjCIvarOffsetVariable(Interface, Ivar); - IvarOffsetValue = - CGF.Builder.CreateAlignedLoad(GV->getValueType(), GV, - CGF.getSizeAlign(), "ivar"); + IvarOffsetValue = CGF.Builder.CreateAlignedLoad(GV->getValueType(), GV, + CGF.getSizeAlign(), "ivar"); if (IsIvarOffsetKnownIdempotent(CGF, Ivar)) 
cast(IvarOffsetValue) ->setMetadata(llvm::LLVMContext::MD_invariant_load, @@ -7216,16 +7121,10 @@ static void appendSelectorForMessageRefTable(std::string &buffer, /// appropriate vtable slot, and if not, it substitues a stub function /// which tail-calls objc_msgSend. Both stubs adjust the selector /// argument to correctly point to the selector. -RValue -CGObjCNonFragileABIMac::EmitVTableMessageSend(CodeGenFunction &CGF, - ReturnValueSlot returnSlot, - QualType resultType, - Selector selector, - llvm::Value *arg0, - QualType arg0Type, - bool isSuper, - const CallArgList &formalArgs, - const ObjCMethodDecl *method) { +RValue CGObjCNonFragileABIMac::EmitVTableMessageSend( + CodeGenFunction &CGF, ReturnValueSlot returnSlot, QualType resultType, + Selector selector, llvm::Value *arg0, QualType arg0Type, bool isSuper, + const CallArgList &formalArgs, const ObjCMethodDecl *method) { // Compute the actual arguments. CallArgList args; @@ -7281,18 +7180,17 @@ CGObjCNonFragileABIMac::EmitVTableMessageSend(CodeGenFunction &CGF, // would have used colons. appendSelectorForMessageRefTable(messageRefName, selector); - llvm::GlobalVariable *messageRef - = CGM.getModule().getGlobalVariable(messageRefName); + llvm::GlobalVariable *messageRef = + CGM.getModule().getGlobalVariable(messageRefName); if (!messageRef) { // Build the message ref structure. ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(); values.add(cast(fn.getCallee())); values.add(GetMethodVarName(selector)); - messageRef = values.finishAndCreateGlobal(messageRefName, - CharUnits::fromQuantity(16), - /*constant*/ false, - llvm::GlobalValue::WeakAnyLinkage); + messageRef = values.finishAndCreateGlobal( + messageRefName, CharUnits::fromQuantity(16), + /*constant*/ false, llvm::GlobalValue::WeakAnyLinkage); messageRef->setVisibility(llvm::GlobalValue::HiddenVisibility); messageRef->setSection(GetSectionName("__objc_msgrefs", "coalesced")); } @@ -7309,8 +7207,8 @@ CGObjCNonFragileABIMac::EmitVTableMessageSend(CodeGenFunction &CGF, } Address mref = - Address(CGF.Builder.CreateBitCast(messageRef, ObjCTypes.MessageRefPtrTy), - ObjCTypes.MessageRefTy, CGF.getPointerAlign()); + Address(CGF.Builder.CreateBitCast(messageRef, ObjCTypes.MessageRefPtrTy), + ObjCTypes.MessageRefTy, CGF.getPointerAlign()); // Update the message ref argument. args[1].setRValue(RValue::get(mref, CGF)); @@ -7328,22 +7226,17 @@ CGObjCNonFragileABIMac::EmitVTableMessageSend(CodeGenFunction &CGF, } /// Generate code for a message send expression in the nonfragile abi. -CodeGen::RValue -CGObjCNonFragileABIMac::GenerateMessageSend(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - llvm::Value *Receiver, - const CallArgList &CallArgs, - const ObjCInterfaceDecl *Class, - const ObjCMethodDecl *Method) { +CodeGen::RValue CGObjCNonFragileABIMac::GenerateMessageSend( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, QualType ResultType, + Selector Sel, llvm::Value *Receiver, const CallArgList &CallArgs, + const ObjCInterfaceDecl *Class, const ObjCMethodDecl *Method) { return isVTableDispatchedSelector(Sel) - ? EmitVTableMessageSend(CGF, Return, ResultType, Sel, - Receiver, CGF.getContext().getObjCIdType(), - false, CallArgs, Method) - : EmitMessageSend(CGF, Return, ResultType, Sel, - Receiver, CGF.getContext().getObjCIdType(), - false, CallArgs, Method, Class, ObjCTypes); + ? 
EmitVTableMessageSend(CGF, Return, ResultType, Sel, Receiver, + CGF.getContext().getObjCIdType(), false, + CallArgs, Method) + : EmitMessageSend(CGF, Return, ResultType, Sel, Receiver, + CGF.getContext().getObjCIdType(), false, + CallArgs, Method, Class, ObjCTypes); } llvm::Constant * @@ -7351,13 +7244,12 @@ CGObjCNonFragileABIMac::GetClassGlobal(const ObjCInterfaceDecl *ID, bool metaclass, ForDefinition_t isForDefinition) { auto prefix = - (metaclass ? getMetaclassSymbolPrefix() : getClassSymbolPrefix()); + (metaclass ? getMetaclassSymbolPrefix() : getClassSymbolPrefix()); return GetClassGlobal((prefix + ID->getObjCRuntimeNameAsString()).str(), - isForDefinition, - ID->isWeakImported(), - !isForDefinition - && CGM.getTriple().isOSBinFormatCOFF() - && ID->hasAttr()); + isForDefinition, ID->isWeakImported(), + !isForDefinition && + CGM.getTriple().isOSBinFormatCOFF() && + ID->hasAttr()); } llvm::Constant * @@ -7390,8 +7282,8 @@ CGObjCNonFragileABIMac::GetClassGlobal(StringRef Name, llvm::Constant * CGObjCNonFragileABIMac::GetClassGlobalForClassRef(const ObjCInterfaceDecl *ID) { - llvm::Constant *ClassGV = GetClassGlobal(ID, /*metaclass*/ false, - NotForDefinition); + llvm::Constant *ClassGV = + GetClassGlobal(ID, /*metaclass*/ false, NotForDefinition); if (!ID->hasAttr()) return ClassGV; @@ -7411,18 +7303,16 @@ CGObjCNonFragileABIMac::EmitLoadOfClassRef(CodeGenFunction &CGF, if (ID && ID->hasAttr()) { // Classrefs pointing at Objective-C stub classes must be loaded by calling // a special runtime function. - return CGF.EmitRuntimeCall( - ObjCTypes.getLoadClassrefFn(), Entry, "load_classref_result"); + return CGF.EmitRuntimeCall(ObjCTypes.getLoadClassrefFn(), Entry, + "load_classref_result"); } CharUnits Align = CGF.getPointerAlign(); return CGF.Builder.CreateAlignedLoad(Entry->getValueType(), Entry, Align); } -llvm::Value * -CGObjCNonFragileABIMac::EmitClassRefFromId(CodeGenFunction &CGF, - IdentifierInfo *II, - const ObjCInterfaceDecl *ID) { +llvm::Value *CGObjCNonFragileABIMac::EmitClassRefFromId( + CodeGenFunction &CGF, IdentifierInfo *II, const ObjCInterfaceDecl *ID) { llvm::GlobalVariable *&Entry = ClassReferences[II]; if (!Entry) { @@ -7462,8 +7352,8 @@ llvm::Value *CGObjCNonFragileABIMac::EmitClassRef(CodeGenFunction &CGF, return EmitClassRefFromId(CGF, ID->getIdentifier(), ID); } -llvm::Value *CGObjCNonFragileABIMac::EmitNSAutoreleasePoolClassRef( - CodeGenFunction &CGF) { +llvm::Value * +CGObjCNonFragileABIMac::EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) { IdentifierInfo *II = &CGM.getContext().Idents.get("NSAutoreleasePool"); return EmitClassRefFromId(CGF, II, nullptr); } @@ -7491,11 +7381,10 @@ CGObjCNonFragileABIMac::EmitSuperClassRef(CodeGenFunction &CGF, /// EmitMetaClassRef - Return a Value * of the address of _class_t /// meta-data /// -llvm::Value *CGObjCNonFragileABIMac::EmitMetaClassRef(CodeGenFunction &CGF, - const ObjCInterfaceDecl *ID, - bool Weak) { +llvm::Value *CGObjCNonFragileABIMac::EmitMetaClassRef( + CodeGenFunction &CGF, const ObjCInterfaceDecl *ID, bool Weak) { CharUnits Align = CGF.getPointerAlign(); - llvm::GlobalVariable * &Entry = MetaClassReferences[ID->getIdentifier()]; + llvm::GlobalVariable *&Entry = MetaClassReferences[ID->getIdentifier()]; if (!Entry) { auto MetaClassGV = GetClassGlobal(ID, /*metaclass*/ true, NotForDefinition); std::string SectionName = @@ -7528,17 +7417,11 @@ llvm::Value *CGObjCNonFragileABIMac::GetClass(CodeGenFunction &CGF, /// Generates a message send where the super is the receiver. 
This is /// a message send to self with special delivery semantics indicating /// which class's method should be called. -CodeGen::RValue -CGObjCNonFragileABIMac::GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - const ObjCInterfaceDecl *Class, - bool isCategoryImpl, - llvm::Value *Receiver, - bool IsClassMessage, - const CodeGen::CallArgList &CallArgs, - const ObjCMethodDecl *Method) { +CodeGen::RValue CGObjCNonFragileABIMac::GenerateMessageSendSuper( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot Return, QualType ResultType, + Selector Sel, const ObjCInterfaceDecl *Class, bool isCategoryImpl, + llvm::Value *Receiver, bool IsClassMessage, + const CodeGen::CallArgList &CallArgs, const ObjCMethodDecl *Method) { // ... // Create and init a super structure; this is a (receiver, class) // pair we will pass to objc_msgSendSuper. @@ -7546,38 +7429,38 @@ CGObjCNonFragileABIMac::GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, ObjCTypes.SuperTy, CGF.getPointerAlign(), "objc_super"); llvm::Value *ReceiverAsObject = - CGF.Builder.CreateBitCast(Receiver, ObjCTypes.ObjectPtrTy); + CGF.Builder.CreateBitCast(Receiver, ObjCTypes.ObjectPtrTy); CGF.Builder.CreateStore(ReceiverAsObject, CGF.Builder.CreateStructGEP(ObjCSuper, 0)); // If this is a class message the metaclass is passed as the target. llvm::Value *Target; if (IsClassMessage) - Target = EmitMetaClassRef(CGF, Class, Class->isWeakImported()); + Target = EmitMetaClassRef(CGF, Class, Class->isWeakImported()); else Target = EmitSuperClassRef(CGF, Class); // FIXME: We shouldn't need to do this cast, rectify the ASTContext and // ObjCTypes types. llvm::Type *ClassTy = - CGM.getTypes().ConvertType(CGF.getContext().getObjCClassType()); + CGM.getTypes().ConvertType(CGF.getContext().getObjCClassType()); Target = CGF.Builder.CreateBitCast(Target, ClassTy); CGF.Builder.CreateStore(Target, CGF.Builder.CreateStructGEP(ObjCSuper, 1)); return (isVTableDispatchedSelector(Sel)) - ? EmitVTableMessageSend(CGF, Return, ResultType, Sel, - ObjCSuper.getPointer(), ObjCTypes.SuperPtrCTy, - true, CallArgs, Method) - : EmitMessageSend(CGF, Return, ResultType, Sel, - ObjCSuper.getPointer(), ObjCTypes.SuperPtrCTy, - true, CallArgs, Method, Class, ObjCTypes); + ? EmitVTableMessageSend( + CGF, Return, ResultType, Sel, ObjCSuper.getPointer(), + ObjCTypes.SuperPtrCTy, true, CallArgs, Method) + : EmitMessageSend(CGF, Return, ResultType, Sel, + ObjCSuper.getPointer(), ObjCTypes.SuperPtrCTy, + true, CallArgs, Method, Class, ObjCTypes); } llvm::Value *CGObjCNonFragileABIMac::EmitSelector(CodeGenFunction &CGF, Selector Sel) { Address Addr = EmitSelectorAddr(Sel); - llvm::LoadInst* LI = CGF.Builder.CreateLoad(Addr); + llvm::LoadInst *LI = CGF.Builder.CreateLoad(Addr); LI->setMetadata(llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(VMContext, {})); return LI; @@ -7606,15 +7489,14 @@ ConstantAddress CGObjCNonFragileABIMac::EmitSelectorAddr(Selector Sel) { /// objc_assign_ivar (id src, id *dst, ptrdiff_t) /// void CGObjCNonFragileABIMac::EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, - Address dst, + llvm::Value *src, Address dst, llvm::Value *ivarOffset) { - llvm::Type * SrcTy = src->getType(); + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); src = (Size == 4 ? 
CGF.Builder.CreateBitCast(src, ObjCTypes.IntTy) - : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); + : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); src = CGF.Builder.CreateIntToPtr(src, ObjCTypes.Int8PtrTy); } src = CGF.Builder.CreateBitCast(src, ObjCTypes.ObjectPtrTy); @@ -7628,22 +7510,21 @@ void CGObjCNonFragileABIMac::EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, /// objc_assign_strongCast (id src, id *dst) /// void CGObjCNonFragileABIMac::EmitObjCStrongCastAssign( - CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dst) { - llvm::Type * SrcTy = src->getType(); + CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dst) { + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); src = (Size == 4 ? CGF.Builder.CreateBitCast(src, ObjCTypes.IntTy) - : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); + : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); src = CGF.Builder.CreateIntToPtr(src, ObjCTypes.Int8PtrTy); } src = CGF.Builder.CreateBitCast(src, ObjCTypes.ObjectPtrTy); llvm::Value *dstVal = CGF.Builder.CreateBitCast(dst.emitRawPointer(CGF), ObjCTypes.PtrObjectPtrTy); llvm::Value *args[] = {src, dstVal}; - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignStrongCastFn(), - args, "weakassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignStrongCastFn(), args, + "weakassign"); } void CGObjCNonFragileABIMac::EmitGCMemmoveCollectable( @@ -7657,15 +7538,14 @@ void CGObjCNonFragileABIMac::EmitGCMemmoveCollectable( /// EmitObjCWeakRead - Code gen for loading value of a __weak /// object: objc_read_weak (id *src) /// -llvm::Value * CGObjCNonFragileABIMac::EmitObjCWeakRead( - CodeGen::CodeGenFunction &CGF, - Address AddrWeakObj) { +llvm::Value * +CGObjCNonFragileABIMac::EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, + Address AddrWeakObj) { llvm::Type *DestTy = AddrWeakObj.getElementType(); llvm::Value *AddrWeakObjVal = CGF.Builder.CreateBitCast( AddrWeakObj.emitRawPointer(CGF), ObjCTypes.PtrObjectPtrTy); - llvm::Value *read_weak = - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcReadWeakFn(), - AddrWeakObjVal, "weakread"); + llvm::Value *read_weak = CGF.EmitNounwindRuntimeCall( + ObjCTypes.getGcReadWeakFn(), AddrWeakObjVal, "weakread"); read_weak = CGF.Builder.CreateBitCast(read_weak, DestTy); return read_weak; } @@ -7675,34 +7555,34 @@ llvm::Value * CGObjCNonFragileABIMac::EmitObjCWeakRead( /// void CGObjCNonFragileABIMac::EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dst) { - llvm::Type * SrcTy = src->getType(); + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); src = (Size == 4 ? CGF.Builder.CreateBitCast(src, ObjCTypes.IntTy) - : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); + : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); src = CGF.Builder.CreateIntToPtr(src, ObjCTypes.Int8PtrTy); } src = CGF.Builder.CreateBitCast(src, ObjCTypes.ObjectPtrTy); llvm::Value *dstVal = CGF.Builder.CreateBitCast(dst.emitRawPointer(CGF), ObjCTypes.PtrObjectPtrTy); llvm::Value *args[] = {src, dstVal}; - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignWeakFn(), - args, "weakassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignWeakFn(), args, + "weakassign"); } /// EmitObjCGlobalAssign - Code gen for assigning to a __strong object. 
/// objc_assign_global (id src, id *dst) /// void CGObjCNonFragileABIMac::EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, - llvm::Value *src, Address dst, - bool threadlocal) { - llvm::Type * SrcTy = src->getType(); + llvm::Value *src, Address dst, + bool threadlocal) { + llvm::Type *SrcTy = src->getType(); if (!isa(SrcTy)) { unsigned Size = CGM.getDataLayout().getTypeAllocSize(SrcTy); assert(Size <= 8 && "does not support size > 8"); src = (Size == 4 ? CGF.Builder.CreateBitCast(src, ObjCTypes.IntTy) - : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); + : CGF.Builder.CreateBitCast(src, ObjCTypes.LongTy)); src = CGF.Builder.CreateIntToPtr(src, ObjCTypes.Int8PtrTy); } src = CGF.Builder.CreateBitCast(src, ObjCTypes.ObjectPtrTy); @@ -7710,30 +7590,27 @@ void CGObjCNonFragileABIMac::EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, ObjCTypes.PtrObjectPtrTy); llvm::Value *args[] = {src, dstVal}; if (!threadlocal) - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignGlobalFn(), - args, "globalassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignGlobalFn(), args, + "globalassign"); else - CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignThreadLocalFn(), - args, "threadlocalassign"); + CGF.EmitNounwindRuntimeCall(ObjCTypes.getGcAssignThreadLocalFn(), args, + "threadlocalassign"); } -void -CGObjCNonFragileABIMac::EmitSynchronizedStmt(CodeGen::CodeGenFunction &CGF, - const ObjCAtSynchronizedStmt &S) { +void CGObjCNonFragileABIMac::EmitSynchronizedStmt( + CodeGen::CodeGenFunction &CGF, const ObjCAtSynchronizedStmt &S) { EmitAtSynchronizedStmt(CGF, S, ObjCTypes.getSyncEnterFn(), ObjCTypes.getSyncExitFn()); } -llvm::Constant * -CGObjCNonFragileABIMac::GetEHType(QualType T) { +llvm::Constant *CGObjCNonFragileABIMac::GetEHType(QualType T) { // There's a particular fixed type info for 'id'. if (T->isObjCIdType() || T->isObjCQualifiedIdType()) { auto *IDEHType = CGM.getModule().getGlobalVariable("OBJC_EHTYPE_id"); if (!IDEHType) { - IDEHType = - new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, false, - llvm::GlobalValue::ExternalLinkage, nullptr, - "OBJC_EHTYPE_id"); + IDEHType = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.EHTypeTy, false, + llvm::GlobalValue::ExternalLinkage, nullptr, "OBJC_EHTYPE_id"); if (CGM.getTriple().isOSBinFormatCOFF()) IDEHType->setDLLStorageClass(getStorage(CGM, "OBJC_EHTYPE_id")); } @@ -7781,7 +7658,7 @@ void CGObjCNonFragileABIMac::EmitThrowStmt(CodeGen::CodeGenFunction &CGF, llvm::Constant * CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID, ForDefinition_t IsForDefinition) { - llvm::GlobalVariable * &Entry = EHTypeReferences[ID->getIdentifier()]; + llvm::GlobalVariable *&Entry = EHTypeReferences[ID->getIdentifier()]; StringRef ClassName = ID->getObjCRuntimeNameAsString(); // If we don't need a definition, return the entry if found or check @@ -7794,9 +7671,9 @@ CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID, // attribute, emit an external reference. 
if (hasObjCExceptionAttribute(CGM.getContext(), ID)) { std::string EHTypeName = ("OBJC_EHTYPE_$_" + ClassName).str(); - Entry = new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.EHTypeTy, - false, llvm::GlobalValue::ExternalLinkage, - nullptr, EHTypeName); + Entry = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.EHTypeTy, false, + llvm::GlobalValue::ExternalLinkage, nullptr, EHTypeName); CGM.setGVProperties(Entry, ID); return Entry; } @@ -7808,10 +7685,9 @@ CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID, std::string VTableName = "objc_ehtype_vtable"; auto *VTableGV = CGM.getModule().getGlobalVariable(VTableName); if (!VTableGV) { - VTableGV = - new llvm::GlobalVariable(CGM.getModule(), ObjCTypes.Int8PtrTy, false, - llvm::GlobalValue::ExternalLinkage, nullptr, - VTableName); + VTableGV = new llvm::GlobalVariable( + CGM.getModule(), ObjCTypes.Int8PtrTy, false, + llvm::GlobalValue::ExternalLinkage, nullptr, VTableName); if (CGM.getTriple().isOSBinFormatCOFF()) VTableGV->setDLLStorageClass(getStorage(CGM, VTableName)); } @@ -7819,9 +7695,8 @@ CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID, llvm::Value *VTableIdx = llvm::ConstantInt::get(CGM.Int32Ty, 2); ConstantInitBuilder builder(CGM); auto values = builder.beginStruct(ObjCTypes.EHTypeTy); - values.add( - llvm::ConstantExpr::getInBoundsGetElementPtr(VTableGV->getValueType(), - VTableGV, VTableIdx)); + values.add(llvm::ConstantExpr::getInBoundsGetElementPtr( + VTableGV->getValueType(), VTableGV, VTableIdx)); values.add(GetClassName(ClassName)); values.add(GetClassGlobal(ID, /*metaclass*/ false, NotForDefinition)); @@ -7834,8 +7709,7 @@ CGObjCNonFragileABIMac::GetInterfaceEHType(const ObjCInterfaceDecl *ID, } else { Entry = values.finishAndCreateGlobal("OBJC_EHTYPE_$_" + ClassName, CGM.getPointerAlign(), - /*constant*/ false, - L); + /*constant*/ false, L); if (hasObjCExceptionAttribute(CGM.getContext(), ID)) CGM.setGVProperties(Entry, ID); } @@ -7858,7 +7732,7 @@ CodeGen::CGObjCRuntime * CodeGen::CreateMacObjCRuntime(CodeGen::CodeGenModule &CGM) { switch (CGM.getLangOpts().ObjCRuntime.getKind()) { case ObjCRuntime::FragileMacOSX: - return new CGObjCMac(CGM); + return new CGObjCMac(CGM); case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: diff --git a/clang/lib/CodeGen/CGObjCRuntime.h b/clang/lib/CodeGen/CGObjCRuntime.h index 3bd981256f475..72997bf6348ae 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.h +++ b/clang/lib/CodeGen/CGObjCRuntime.h @@ -23,40 +23,40 @@ #include "llvm/ADT/UniqueVector.h" namespace llvm { - class Constant; - class Function; - class Module; - class StructLayout; - class StructType; - class Type; - class Value; -} +class Constant; +class Function; +class Module; +class StructLayout; +class StructType; +class Type; +class Value; +} // namespace llvm namespace clang { namespace CodeGen { class CGFunctionInfo; class CodeGenFunction; -} - - class FieldDecl; - class ObjCAtTryStmt; - class ObjCAtThrowStmt; - class ObjCAtSynchronizedStmt; - class ObjCContainerDecl; - class ObjCCategoryImplDecl; - class ObjCImplementationDecl; - class ObjCInterfaceDecl; - class ObjCMessageExpr; - class ObjCMethodDecl; - class ObjCProtocolDecl; - class Selector; - class ObjCIvarDecl; - class ObjCStringLiteral; - class BlockDeclRefExpr; +} // namespace CodeGen + +class FieldDecl; +class ObjCAtTryStmt; +class ObjCAtThrowStmt; +class ObjCAtSynchronizedStmt; +class ObjCContainerDecl; +class ObjCCategoryImplDecl; +class ObjCImplementationDecl; +class ObjCInterfaceDecl; +class ObjCMessageExpr; +class 
ObjCMethodDecl; +class ObjCProtocolDecl; +class Selector; +class ObjCIvarDecl; +class ObjCStringLiteral; +class BlockDeclRefExpr; namespace CodeGen { - class CodeGenModule; - class CGBlockInfo; +class CodeGenModule; +class CGBlockInfo; // FIXME: Several methods should be pure virtual but aren't to avoid the // partially-implemented subclass breaking. @@ -88,8 +88,7 @@ class CGObjCRuntime { const ObjCInterfaceDecl *OID, llvm::Value *BaseValue, const ObjCIvarDecl *Ivar, - unsigned CVRQualifiers, - llvm::Value *Offset); + unsigned CVRQualifiers, llvm::Value *Offset); /// Emits a try / catch statement. This function is intended to be called by /// subclasses, and provides a generic mechanism for generating these, which /// should be usable by all runtimes. The caller must provide the functions @@ -145,7 +144,7 @@ class CGObjCRuntime { /// error to Sema. virtual llvm::Constant *GetEHType(QualType T) = 0; - virtual CatchTypeInfo getCatchAllTypeInfo() { return { nullptr, 0 }; } + virtual CatchTypeInfo getCatchAllTypeInfo() { return {nullptr, 0}; } /// Generate a constant string object. virtual ConstantAddress GenerateConstantString(const StringLiteral *) = 0; @@ -165,11 +164,8 @@ class CGObjCRuntime { /// \param Method - The method being called, this may be null if synthesizing /// a property setter or getter. virtual CodeGen::RValue - GenerateMessageSend(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot ReturnSlot, - QualType ResultType, - Selector Sel, - llvm::Value *Receiver, + GenerateMessageSend(CodeGen::CodeGenFunction &CGF, ReturnValueSlot ReturnSlot, + QualType ResultType, Selector Sel, llvm::Value *Receiver, const CallArgList &CallArgs, const ObjCInterfaceDecl *Class = nullptr, const ObjCMethodDecl *Method = nullptr) = 0; @@ -178,16 +174,11 @@ class CGObjCRuntime { /// /// This variant allows for the call to be substituted with an optimized /// variant. - CodeGen::RValue - GeneratePossiblySpecializedMessageSend(CodeGenFunction &CGF, - ReturnValueSlot Return, - QualType ResultType, - Selector Sel, - llvm::Value *Receiver, - const CallArgList& Args, - const ObjCInterfaceDecl *OID, - const ObjCMethodDecl *Method, - bool isClassMessage); + CodeGen::RValue GeneratePossiblySpecializedMessageSend( + CodeGenFunction &CGF, ReturnValueSlot Return, QualType ResultType, + Selector Sel, llvm::Value *Receiver, const CallArgList &Args, + const ObjCInterfaceDecl *OID, const ObjCMethodDecl *Method, + bool isClassMessage); /// Generate an Objective-C message send operation to the super /// class initiated in a method for Class and with the given Self @@ -195,17 +186,11 @@ class CGObjCRuntime { /// /// \param Method - The method being called, this may be null if synthesizing /// a property setter or getter. - virtual CodeGen::RValue - GenerateMessageSendSuper(CodeGen::CodeGenFunction &CGF, - ReturnValueSlot ReturnSlot, - QualType ResultType, - Selector Sel, - const ObjCInterfaceDecl *Class, - bool isCategoryImpl, - llvm::Value *Self, - bool IsClassMessage, - const CallArgList &CallArgs, - const ObjCMethodDecl *Method = nullptr) = 0; + virtual CodeGen::RValue GenerateMessageSendSuper( + CodeGen::CodeGenFunction &CGF, ReturnValueSlot ReturnSlot, + QualType ResultType, Selector Sel, const ObjCInterfaceDecl *Class, + bool isCategoryImpl, llvm::Value *Self, bool IsClassMessage, + const CallArgList &CallArgs, const ObjCMethodDecl *Method = nullptr) = 0; /// Walk the list of protocol references from a class, category or /// protocol to traverse the DAG formed from it's inheritance hierarchy. 
Find @@ -272,7 +257,6 @@ class CGObjCRuntime { virtual llvm::Value *GetClass(CodeGenFunction &CGF, const ObjCInterfaceDecl *OID) = 0; - virtual llvm::Value *EmitNSAutoreleasePoolClassRef(CodeGenFunction &CGF) { llvm_unreachable("autoreleasepool unsupported in this ABI"); } @@ -287,14 +271,14 @@ class CGObjCRuntime { const ObjCAtTryStmt &S) = 0; virtual void EmitThrowStmt(CodeGen::CodeGenFunction &CGF, const ObjCAtThrowStmt &S, - bool ClearInsertionPoint=true) = 0; + bool ClearInsertionPoint = true) = 0; virtual llvm::Value *EmitObjCWeakRead(CodeGen::CodeGenFunction &CGF, Address AddrWeakObj) = 0; virtual void EmitObjCWeakAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dest) = 0; virtual void EmitObjCGlobalAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dest, - bool threadlocal=false) = 0; + bool threadlocal = false) = 0; virtual void EmitObjCIvarAssign(CodeGen::CodeGenFunction &CGF, llvm::Value *src, Address dest, llvm::Value *ivarOffset) = 0; @@ -302,21 +286,21 @@ class CGObjCRuntime { llvm::Value *src, Address dest) = 0; virtual LValue EmitObjCValueForIvar(CodeGen::CodeGenFunction &CGF, - QualType ObjectTy, - llvm::Value *BaseValue, + QualType ObjectTy, llvm::Value *BaseValue, const ObjCIvarDecl *Ivar, unsigned CVRQualifiers) = 0; virtual llvm::Value *EmitIvarOffset(CodeGen::CodeGenFunction &CGF, const ObjCInterfaceDecl *Interface, const ObjCIvarDecl *Ivar) = 0; virtual void EmitGCMemmoveCollectable(CodeGen::CodeGenFunction &CGF, - Address DestPtr, - Address SrcPtr, + Address DestPtr, Address SrcPtr, llvm::Value *Size) = 0; - virtual llvm::Constant *BuildGCBlockLayout(CodeGen::CodeGenModule &CGM, - const CodeGen::CGBlockInfo &blockInfo) = 0; - virtual llvm::Constant *BuildRCBlockLayout(CodeGen::CodeGenModule &CGM, - const CodeGen::CGBlockInfo &blockInfo) = 0; + virtual llvm::Constant * + BuildGCBlockLayout(CodeGen::CodeGenModule &CGM, + const CodeGen::CGBlockInfo &blockInfo) = 0; + virtual llvm::Constant * + BuildRCBlockLayout(CodeGen::CodeGenModule &CGM, + const CodeGen::CGBlockInfo &blockInfo) = 0; virtual std::string getRCBlockLayoutStr(CodeGen::CodeGenModule &CGM, const CGBlockInfo &blockInfo) { return {}; @@ -332,15 +316,14 @@ class CGObjCRuntime { MessageSendInfo(const CGFunctionInfo &callInfo, llvm::PointerType *messengerType) - : CallInfo(callInfo), MessengerType(messengerType) {} + : CallInfo(callInfo), MessengerType(messengerType) {} }; MessageSendInfo getMessageSendInfo(const ObjCMethodDecl *method, QualType resultType, CallArgList &callArgs); bool canMessageReceiverBeNull(CodeGenFunction &CGF, - const ObjCMethodDecl *method, - bool isSuper, + const ObjCMethodDecl *method, bool isSuper, const ObjCInterfaceDecl *classReceiver, llvm::Value *receiver); static bool isWeakLinkedClass(const ObjCInterfaceDecl *cls); @@ -364,9 +347,9 @@ class CGObjCRuntime { }; /// Creates an instance of an Objective-C runtime class. -//TODO: This should include some way of selecting which runtime to target. +// TODO: This should include some way of selecting which runtime to target. 
 CGObjCRuntime *CreateGNUObjCRuntime(CodeGenModule &CGM);
 CGObjCRuntime *CreateMacObjCRuntime(CodeGenModule &CGM);
-}
-}
+} // namespace CodeGen
+} // namespace clang
 
 #endif
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index cafaaa364cb76..b679d63874b3b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1457,14 +1457,13 @@ void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) {
     clearLocThreadIdInsertPt(CGF);
     OpenMPLocThreadIDMap.erase(CGF.CurFn);
   }
-  if (FunctionUDRMap.count(CGF.CurFn) > 0) {
-    for(const auto *D : FunctionUDRMap[CGF.CurFn])
+  if (auto I = FunctionUDRMap.find(CGF.CurFn); I != FunctionUDRMap.end()) {
+    for (const auto *D : I->second)
       UDRMap.erase(D);
-    FunctionUDRMap.erase(CGF.CurFn);
+    FunctionUDRMap.erase(I);
   }
-  auto I = FunctionUDMMap.find(CGF.CurFn);
-  if (I != FunctionUDMMap.end()) {
-    for(const auto *D : I->second)
+  if (auto I = FunctionUDMMap.find(CGF.CurFn); I != FunctionUDMMap.end()) {
+    for (const auto *D : I->second)
       UDMMap.erase(D);
     FunctionUDMMap.erase(I);
   }
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 8e694b95dc7e7..3542e939678cf 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -2457,10 +2457,86 @@ static void emitSimdlenSafelenClause(CodeGenFunction &CGF,
   }
 }
 
+// Check for the presence of an `OMPOrderedDirective`,
+// i.e., `ordered` in `#pragma omp ordered simd`.
+//
+// Consider the following source code:
+// ```
+// __attribute__((noinline)) void omp_simd_loop(float X[ARRAY_SIZE][ARRAY_SIZE])
+// {
+//   for (int r = 1; r < ARRAY_SIZE; ++r) {
+//     for (int c = 1; c < ARRAY_SIZE; ++c) {
+// #pragma omp simd
+//       for (int k = 2; k < ARRAY_SIZE; ++k) {
+// #pragma omp ordered simd
+//         X[r][k] = X[r][k - 2] + sinf((float)(r / c));
+//       }
+//     }
+//   }
+// }
+// ```
+//
+// Suppose we are in `CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective
+// &D)`. By examining `D.dump()` we have the following AST containing
+// `OMPOrderedDirective`:
+//
+// ```
+// OMPSimdDirective 0x1c32950
+// `-CapturedStmt 0x1c32028
+//   |-CapturedDecl 0x1c310e8
+//   | |-ForStmt 0x1c31e30
+//   | | |-DeclStmt 0x1c31298
+//   | | | `-VarDecl 0x1c31208 used k 'int' cinit
+//   | | |   `-IntegerLiteral 0x1c31278 'int' 2
+//   | | |-<<<NULL>>>
+//   | | |-BinaryOperator 0x1c31308 'int' '<'
+//   | | | |-ImplicitCastExpr 0x1c312f0 'int' <LValueToRValue>
+//   | | | | `-DeclRefExpr 0x1c312b0 'int' lvalue Var 0x1c31208 'k' 'int'
+//   | | | `-IntegerLiteral 0x1c312d0 'int' 256
+//   | | |-UnaryOperator 0x1c31348 'int' prefix '++'
+//   | | | `-DeclRefExpr 0x1c31328 'int' lvalue Var 0x1c31208 'k' 'int'
+//   | | `-CompoundStmt 0x1c31e18
+//   | |   `-OMPOrderedDirective 0x1c31dd8
+//   | |     |-OMPSimdClause 0x1c31380
+//   | |     `-CapturedStmt 0x1c31cd0
+// ```
+//
+// Note the presence of `OMPOrderedDirective` above:
+// It's (transitively) nested in a `CapturedStmt` representing the pragma
+// annotated compound statement. Thus, we need to consider this nesting and
+// include checking the `getCapturedStmt` in this case.
+static bool hasOrderedDirective(const Stmt *S) {
+  if (isa<OMPOrderedDirective>(S))
+    return true;
+
+  if (const auto *CS = dyn_cast<CapturedStmt>(S))
+    return hasOrderedDirective(CS->getCapturedStmt());
+
+  for (const Stmt *Child : S->children()) {
+    if (Child && hasOrderedDirective(Child))
+      return true;
+  }
+
+  return false;
+}
+
+static void applyConservativeSimdOrderedDirective(const Stmt &AssociatedStmt,
+                                                  LoopInfoStack &LoopStack) {
+  // Check for the presence of an `OMPOrderedDirective`
+  // i.e., `ordered` in `#pragma omp ordered simd`
+  bool HasOrderedDirective = hasOrderedDirective(&AssociatedStmt);
+  // If present then conservatively disable loop vectorization
+  // analogously to how `emitSimdlenSafelenClause` does.
+  if (HasOrderedDirective)
+    LoopStack.setParallel(/*Enable=*/false);
+}
+
 void CodeGenFunction::EmitOMPSimdInit(const OMPLoopDirective &D) {
   // Walk clauses and process safelen/lastprivate.
   LoopStack.setParallel(/*Enable=*/true);
   LoopStack.setVectorizeEnable();
+  const Stmt *AssociatedStmt = D.getAssociatedStmt();
+  applyConservativeSimdOrderedDirective(*AssociatedStmt, LoopStack);
   emitSimdlenSafelenClause(*this, D);
   if (const auto *C = D.getSingleClause<OMPOrderClause>())
     if (C->getKind() == OMPC_ORDER_concurrent)
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index e7a5100a9fa29..f3af9b4ace441 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -54,7 +54,7 @@ class SwitchInst;
 class Twine;
 class Value;
 class CanonicalLoopInfo;
-}
+} // namespace llvm
 
 namespace clang {
 class ASTContext;
@@ -102,6 +102,7 @@ class TargetCodeGenInfo;
 struct OMPTaskDataTy;
 struct CGCoroData;
 
+// clang-format off
 /// The kind of evaluation to perform on values of a particular
 /// type. Basically, is the code in CGExprScalar, CGExprComplex, or
 /// CGExprAgg?
@@ -112,6 +113,7 @@ enum TypeEvaluationKind {
   TEK_Complex,
   TEK_Aggregate
 };
+// clang-format on
 
 #define LIST_SANITIZER_CHECKS                                                  \
   SANITIZER_CHECK(AddOverflow, add_overflow, 0)                                \
@@ -150,7 +152,7 @@ enum SanitizerHandler {
 /// Helper class with most of the code for saving a value for a
 /// conditional expression cleanup.
 struct DominatingLLVMValue {
-  typedef llvm::PointerIntPair<llvm::Value*, 1, bool> saved_type;
+  typedef llvm::PointerIntPair<llvm::Value *, 1, bool> saved_type;
 
   /// Answer whether the given value needs extra work to be saved.
   static bool needsSaving(llvm::Value *value) {
@@ -158,7 +160,8 @@ struct DominatingLLVMValue {
       return false;
 
     // If it's not an instruction, we don't need to save.
-    if (!isa<llvm::Instruction>(value)) return false;
+    if (!isa<llvm::Instruction>(value))
+      return false;
 
     // If it's an instruction in the entry block, we don't need to save.
     llvm::BasicBlock *block = cast<llvm::Instruction>(value)->getParent();
@@ -171,10 +174,10 @@ struct DominatingLLVMValue {
 
 /// A partial specialization of DominatingValue for llvm::Values that
 /// might be llvm::Instructions.
-template <class T> struct DominatingPointer<T,true> : DominatingLLVMValue {
+template <class T> struct DominatingPointer<T, true> : DominatingLLVMValue {
   typedef T *type;
   static type restore(CodeGenFunction &CGF, saved_type value) {
-    return static_cast<T*>(DominatingLLVMValue::restore(CGF, value));
+    return static_cast<T *>(DominatingLLVMValue::restore(CGF, value));
   }
 };
 
@@ -212,8 +215,13 @@ template <> struct DominatingValue
{ template <> struct DominatingValue { typedef RValue type; class saved_type { - enum Kind { ScalarLiteral, ScalarAddress, AggregateLiteral, - AggregateAddress, ComplexAddress }; + enum Kind { + ScalarLiteral, + ScalarAddress, + AggregateLiteral, + AggregateAddress, + ComplexAddress + }; union { struct { DominatingLLVMValue::saved_type first, second; @@ -241,9 +249,7 @@ template <> struct DominatingValue { // implementations in CGCleanup.cpp }; - static bool needsSaving(type value) { - return saved_type::needsSaving(value); - } + static bool needsSaving(type value) { return saved_type::needsSaving(value); } static saved_type save(CodeGenFunction &CGF, type value) { return saved_type::save(CGF, value); } @@ -259,6 +265,7 @@ class CodeGenFunction : public CodeGenTypeCache { void operator=(const CodeGenFunction &) = delete; friend class CGCXXABI; + public: /// A jump destination is an abstract label, branching to which may /// require a jump out through normal cleanups. @@ -284,7 +291,7 @@ class CodeGenFunction : public CodeGenTypeCache { unsigned Index; }; - CodeGenModule &CGM; // Per-module state. + CodeGenModule &CGM; // Per-module state. const TargetInfo &Target; // For EH/SEH outlined funclets, this field points to parent's CGF @@ -369,9 +376,7 @@ class CodeGenFunction : public CodeGenTypeCache { }; CGCoroInfo CurCoro; - bool isCoroutine() const { - return CurCoro.Data != nullptr; - } + bool isCoroutine() const { return CurCoro.Data != nullptr; } bool inSuspendBlock() const { return isCoroutine() && CurCoro.InSuspendBlock; @@ -464,10 +469,10 @@ class CodeGenFunction : public CodeGenTypeCache { : Kind(K), ThisValue(nullptr), CXXThisFieldDecl(nullptr) {} explicit CGCapturedStmtInfo(const CapturedStmt &S, CapturedRegionKind K = CR_Default) - : Kind(K), ThisValue(nullptr), CXXThisFieldDecl(nullptr) { + : Kind(K), ThisValue(nullptr), CXXThisFieldDecl(nullptr) { RecordDecl::field_iterator Field = - S.getCapturedRecordDecl()->field_begin(); + S.getCapturedRecordDecl()->field_begin(); for (CapturedStmt::const_capture_iterator I = S.capture_begin(), E = S.capture_end(); I != E; ++I, ++Field) { @@ -496,9 +501,7 @@ class CodeGenFunction : public CodeGenTypeCache { bool isCXXThisExprCaptured() const { return getThisFieldDecl() != nullptr; } virtual FieldDecl *getThisFieldDecl() const { return CXXThisFieldDecl; } - static bool classof(const CGCapturedStmtInfo *) { - return true; - } + static bool classof(const CGCapturedStmtInfo *) { return true; } /// Emit the captured statement body. virtual void EmitBody(CodeGenFunction &CGF, const Stmt *S) { @@ -535,6 +538,7 @@ class CodeGenFunction : public CodeGenTypeCache { private: CodeGenFunction &CGF; CGCapturedStmtInfo *PrevCapturedStmtInfo; + public: CGCapturedStmtRAII(CodeGenFunction &CGF, CGCapturedStmtInfo *NewCapturedStmtInfo) @@ -578,6 +582,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// RAII object to set/unset CodeGenFunction::IsSanitizerScope. class SanitizerScope { CodeGenFunction *CGF; + public: SanitizerScope(CodeGenFunction *CGF); ~SanitizerScope(); @@ -832,7 +837,7 @@ class CodeGenFunction : public CodeGenTypeCache { public: /// ObjCEHValueStack - Stack of Objective-C exception values, used for /// rethrows. - SmallVector ObjCEHValueStack; + SmallVector ObjCEHValueStack; /// A class controlling the emission of a finally block. 
class FinallyInfo { @@ -900,7 +905,8 @@ class CodeGenFunction : public CodeGenTypeCache { SavedTuple Saved{saveValueInCond(A)...}; typedef EHScopeStack::ConditionalCleanup CleanupType; - pushCleanupAfterFullExprWithActiveFlag(Kind, ActiveFlag, Saved); + pushCleanupAfterFullExprWithActiveFlag(Kind, ActiveFlag, + Saved); } template @@ -990,15 +996,16 @@ class CodeGenFunction : public CodeGenTypeCache { size_t LifetimeExtendedCleanupStackSize; CleanupDeactivationScope DeactivateCleanups; bool OldDidCallStackSave; + protected: bool PerformCleanup; - private: + private: RunCleanupsScope(const RunCleanupsScope &) = delete; void operator=(const RunCleanupsScope &) = delete; protected: - CodeGenFunction& CGF; + CodeGenFunction &CGF; public: /// Enter a new cleanup scope. @@ -1030,7 +1037,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// the insertion point after cleanup emission. If cleanup emission created /// a shared cleanup block, these value pointers will be rewritten. /// Otherwise, they not will be modified. - void ForceCleanup(std::initializer_list ValuesToReload = {}) { + void + ForceCleanup(std::initializer_list ValuesToReload = {}) { assert(PerformCleanup && "Already forced cleanup"); CGF.DidCallStackSave = OldDidCallStackSave; DeactivateCleanups.ForceDeactivate(); @@ -1047,7 +1055,7 @@ class CodeGenFunction : public CodeGenTypeCache { class LexicalScope : public RunCleanupsScope { SourceRange Range; - SmallVector Labels; + SmallVector Labels; LexicalScope *ParentScope; LexicalScope(const LexicalScope &) = delete; @@ -1056,7 +1064,8 @@ class CodeGenFunction : public CodeGenTypeCache { public: /// Enter a new cleanup scope. explicit LexicalScope(CodeGenFunction &CGF, SourceRange Range) - : RunCleanupsScope(CGF), Range(Range), ParentScope(CGF.CurLexicalScope) { + : RunCleanupsScope(CGF), Range(Range), + ParentScope(CGF.CurLexicalScope) { CGF.CurLexicalScope = this; if (CGDebugInfo *DI = CGF.getDebugInfo()) DI->EmitLexicalBlockStart(CGF.Builder, Range.getBegin()); @@ -1091,9 +1100,7 @@ class CodeGenFunction : public CodeGenTypeCache { rescopeLabels(); } - bool hasLabels() const { - return !Labels.empty(); - } + bool hasLabels() const { return !Labels.empty(); } void rescopeLabels(); }; @@ -1120,7 +1127,8 @@ class CodeGenFunction : public CodeGenTypeCache { Address TempAddr) { LocalVD = LocalVD->getCanonicalDecl(); // Only save it once. - if (SavedLocals.count(LocalVD)) return false; + if (SavedLocals.count(LocalVD)) + return false; // Copy the existing local entry to SavedLocals. auto it = CGF.LocalDeclMap.find(LocalVD); @@ -1259,8 +1267,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// target of a potentially scope-crossing jump; get a stable handle /// to which we can perform this jump later. JumpDest getJumpDestInCurrentScope(llvm::BasicBlock *Target) { - return JumpDest(Target, - EHStack.getInnermostNormalCleanup(), + return JumpDest(Target, EHStack.getInnermostNormalCleanup(), NextCleanupDestIndex++); } @@ -1297,7 +1304,7 @@ class CodeGenFunction : public CodeGenTypeCache { public: ConditionalEvaluation(CodeGenFunction &CGF) - : StartBB(CGF.Builder.GetInsertBlock()) {} + : StartBB(CGF.Builder.GetInsertBlock()) {} void begin(CodeGenFunction &CGF) { assert(CGF.OutermostConditional != this); @@ -1313,9 +1320,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// Returns a block which will be executed prior to each /// evaluation of the conditional code. 
- llvm::BasicBlock *getStartingBlock() const { - return StartBB; - } + llvm::BasicBlock *getStartingBlock() const { return StartBB; } }; /// isInConditionalBranch - Return true if we're currently emitting @@ -1343,7 +1348,7 @@ class CodeGenFunction : public CodeGenTypeCache { public: StmtExprEvaluation(CodeGenFunction &CGF) - : CGF(CGF), SavedOutermostConditional(CGF.OutermostConditional) { + : CGF(CGF), SavedOutermostConditional(CGF.OutermostConditional) { CGF.OutermostConditional = nullptr; } @@ -1375,9 +1380,9 @@ class CodeGenFunction : public CodeGenTypeCache { bool BoundLValue; CodeGenFunction::PeepholeProtection Protection; - OpaqueValueMappingData(const OpaqueValueExpr *ov, - bool boundLValue) - : OpaqueValue(ov), BoundLValue(boundLValue) {} + OpaqueValueMappingData(const OpaqueValueExpr *ov, bool boundLValue) + : OpaqueValue(ov), BoundLValue(boundLValue) {} + public: OpaqueValueMappingData() : OpaqueValue(nullptr) {} @@ -1387,30 +1392,26 @@ class CodeGenFunction : public CodeGenTypeCache { // always keeps them in memory. Expressions of function type // act exactly like l-values but are formally required to be // r-values in C. - return expr->isGLValue() || - expr->getType()->isFunctionType() || + return expr->isGLValue() || expr->getType()->isFunctionType() || hasAggregateEvaluationKind(expr->getType()); } - static OpaqueValueMappingData bind(CodeGenFunction &CGF, - const OpaqueValueExpr *ov, - const Expr *e) { + static OpaqueValueMappingData + bind(CodeGenFunction &CGF, const OpaqueValueExpr *ov, const Expr *e) { if (shouldBindAsLValue(ov)) return bind(CGF, ov, CGF.EmitLValue(e)); return bind(CGF, ov, CGF.EmitAnyExpr(e)); } - static OpaqueValueMappingData bind(CodeGenFunction &CGF, - const OpaqueValueExpr *ov, - const LValue &lv) { + static OpaqueValueMappingData + bind(CodeGenFunction &CGF, const OpaqueValueExpr *ov, const LValue &lv) { assert(shouldBindAsLValue(ov)); CGF.OpaqueLValues.insert(std::make_pair(ov, lv)); return OpaqueValueMappingData(ov, true); } - static OpaqueValueMappingData bind(CodeGenFunction &CGF, - const OpaqueValueExpr *ov, - const RValue &rv) { + static OpaqueValueMappingData + bind(CodeGenFunction &CGF, const OpaqueValueExpr *ov, const RValue &rv) { assert(!shouldBindAsLValue(ov)); CGF.OpaqueRValues.insert(std::make_pair(ov, rv)); @@ -1455,7 +1456,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// helpful. /// OpaqueValueMapping(CodeGenFunction &CGF, - const AbstractConditionalOperator *op) : CGF(CGF) { + const AbstractConditionalOperator *op) + : CGF(CGF) { if (isa(op)) // Leave Data empty. 
return; @@ -1476,17 +1478,15 @@ class CodeGenFunction : public CodeGenTypeCache { } } - OpaqueValueMapping(CodeGenFunction &CGF, - const OpaqueValueExpr *opaqueValue, + OpaqueValueMapping(CodeGenFunction &CGF, const OpaqueValueExpr *opaqueValue, LValue lvalue) - : CGF(CGF), Data(OpaqueValueMappingData::bind(CGF, opaqueValue, lvalue)) { - } + : CGF(CGF), + Data(OpaqueValueMappingData::bind(CGF, opaqueValue, lvalue)) {} - OpaqueValueMapping(CodeGenFunction &CGF, - const OpaqueValueExpr *opaqueValue, + OpaqueValueMapping(CodeGenFunction &CGF, const OpaqueValueExpr *opaqueValue, RValue rvalue) - : CGF(CGF), Data(OpaqueValueMappingData::bind(CGF, opaqueValue, rvalue)) { - } + : CGF(CGF), + Data(OpaqueValueMappingData::bind(CGF, opaqueValue, rvalue)) {} void pop() { Data.unbind(CGF); @@ -1494,7 +1494,8 @@ class CodeGenFunction : public CodeGenTypeCache { } ~OpaqueValueMapping() { - if (Data.isValid()) Data.unbind(CGF); + if (Data.isValid()) + Data.unbind(CGF); } }; @@ -1534,13 +1535,13 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::DenseMap EscapedLocals; /// LabelMap - This keeps track of the LLVM basic block for each C label. - llvm::DenseMap LabelMap; + llvm::DenseMap LabelMap; // BreakContinueStack - This keeps track of where break and continue // statements should jump to. struct BreakContinue { BreakContinue(JumpDest Break, JumpDest Continue) - : BreakBlock(Break), ContinueBlock(Continue) {} + : BreakBlock(Break), ContinueBlock(Continue) {} JumpDest BreakBlock; JumpDest ContinueBlock; @@ -1716,12 +1717,9 @@ class CodeGenFunction : public CodeGenTypeCache { /// Get the profiler's current count. This is generally the count for the most /// recently incremented counter. - uint64_t getCurrentProfileCount() { - return PGO.getCurrentRegionCount(); - } + uint64_t getCurrentProfileCount() { return PGO.getCurrentRegionCount(); } private: - /// SwitchInsn - This is nearest current switch instruction. It is null if /// current context is not in a switch. llvm::SwitchInst *SwitchInsn = nullptr; @@ -1746,7 +1744,7 @@ class CodeGenFunction : public CodeGenTypeCache { // multiple VLA types can share the same size expression. // FIXME: Maybe this could be a stack of maps that is pushed/popped as we // enter/leave scopes. - llvm::DenseMap VLASizeMap; + llvm::DenseMap VLASizeMap; /// A block containing a single 'unreachable' instruction. Created /// lazily by getUnreachableBlock(). @@ -1788,7 +1786,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// The scope of a CXXDefaultInitExpr. Within this scope, the value of 'this' /// is overridden to be the object under construction. 
- class CXXDefaultInitExprScope { + class CXXDefaultInitExprScope { public: CXXDefaultInitExprScope(CodeGenFunction &CGF, const CXXDefaultInitExpr *E) : CGF(CGF), OldCXXThisValue(CGF.CXXThisValue), @@ -1819,12 +1817,10 @@ class CodeGenFunction : public CodeGenTypeCache { class ArrayInitLoopExprScope { public: ArrayInitLoopExprScope(CodeGenFunction &CGF, llvm::Value *Index) - : CGF(CGF), OldArrayInitIndex(CGF.ArrayInitIndex) { + : CGF(CGF), OldArrayInitIndex(CGF.ArrayInitIndex) { CGF.ArrayInitIndex = Index; } - ~ArrayInitLoopExprScope() { - CGF.ArrayInitIndex = OldArrayInitIndex; - } + ~ArrayInitLoopExprScope() { CGF.ArrayInitIndex = OldArrayInitIndex; } private: CodeGenFunction &CGF; @@ -2151,7 +2147,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitKernelMetadata(const FunctionDecl *FD, llvm::Function *Fn); public: - CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext=false); + CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext = false); ~CodeGenFunction(); CodeGenTypes &getTypes() const { return CGM.getTypes(); } @@ -2191,7 +2187,8 @@ class CodeGenFunction : public CodeGenTypeCache { } llvm::BasicBlock *getInvokeDest() { - if (!EHStack.requiresLandingPad()) return nullptr; + if (!EHStack.requiresLandingPad()) + return nullptr; return getInvokeDestImpl(); } @@ -2220,10 +2217,10 @@ class CodeGenFunction : public CodeGenTypeCache { CharUnits elementAlignment, Destroyer *destroyer); - void pushDestroy(QualType::DestructionKind dtorKind, - Address addr, QualType type); - void pushEHDestroy(QualType::DestructionKind dtorKind, - Address addr, QualType type); + void pushDestroy(QualType::DestructionKind dtorKind, Address addr, + QualType type); + void pushEHDestroy(QualType::DestructionKind dtorKind, Address addr, + QualType type); void pushDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); void pushDestroyAndDeferDeactivation(QualType::DestructionKind dtorKind, @@ -2248,8 +2245,8 @@ class CodeGenFunction : public CodeGenTypeCache { const VarDecl *VD); void emitArrayDestroy(llvm::Value *begin, llvm::Value *end, QualType elementType, CharUnits elementAlign, - Destroyer *destroyer, - bool checkZeroLength, bool useEHCleanup); + Destroyer *destroyer, bool checkZeroLength, + bool useEHCleanup); Destroyer *getDestroyer(QualType::DestructionKind destructionKind); @@ -2311,8 +2308,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// captured variables, etc. 
llvm::Value *EmitBlockLiteral(const BlockExpr *); - llvm::Function *GenerateBlockFunction(GlobalDecl GD, - const CGBlockInfo &Info, + llvm::Function *GenerateBlockFunction(GlobalDecl GD, const CGBlockInfo &Info, const DeclMapTy &ldm, bool IsLambdaConversionToBlock, bool BuildGlobalBlock); @@ -2322,10 +2318,10 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Constant *GenerateCopyHelperFunction(const CGBlockInfo &blockInfo); llvm::Constant *GenerateDestroyHelperFunction(const CGBlockInfo &blockInfo); - llvm::Constant *GenerateObjCAtomicSetterCopyHelperFunction( - const ObjCPropertyImplDecl *PID); - llvm::Constant *GenerateObjCAtomicGetterCopyHelperFunction( - const ObjCPropertyImplDecl *PID); + llvm::Constant * + GenerateObjCAtomicSetterCopyHelperFunction(const ObjCPropertyImplDecl *PID); + llvm::Constant * + GenerateObjCAtomicGetterCopyHelperFunction(const ObjCPropertyImplDecl *PID); llvm::Value *EmitBlockCopyAndAutorelease(llvm::Value *Block, QualType Ty); void BuildBlockRelease(llvm::Value *DeclPtr, BlockFieldFlags flags, @@ -2364,10 +2360,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// data in a variable which is declared as __block. Address emitBlockByrefAddress(Address baseAddr, const VarDecl *V, bool followForward = true); - Address emitBlockByrefAddress(Address baseAddr, - const BlockByrefInfo &info, - bool followForward, - const llvm::Twine &name); + Address emitBlockByrefAddress(Address baseAddr, const BlockByrefInfo &info, + bool followForward, const llvm::Twine &name); const BlockByrefInfo &getBlockByrefInfo(const VarDecl *var); @@ -2383,11 +2377,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// Emit code for the start of a function. /// \param Loc The location to be associated with the function. /// \param StartLoc The location of the function body. - void StartFunction(GlobalDecl GD, - QualType RetTy, - llvm::Function *Fn, - const CGFunctionInfo &FnInfo, - const FunctionArgList &Args, + void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, + const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc = SourceLocation(), SourceLocation StartLoc = SourceLocation()); @@ -2424,7 +2415,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// FinishFunction - Complete IR generation of the current function. It is /// legal to call this function even if there is no current insertion point. - void FinishFunction(SourceLocation EndLoc=SourceLocation()); + void FinishFunction(SourceLocation EndLoc = SourceLocation()); void StartThunk(llvm::Function *Fn, GlobalDecl GD, const CGFunctionInfo &FnInfo, bool IsUnprototyped); @@ -2567,8 +2558,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitFunctionProlog - Emit the target specific LLVM code to load the /// arguments for the given function. This is also responsible for naming the /// LLVM function arguments. - void EmitFunctionProlog(const CGFunctionInfo &FI, - llvm::Function *Fn, + void EmitFunctionProlog(const CGFunctionInfo &FI, llvm::Function *Fn, const FunctionArgList &Args); /// EmitFunctionEpilog - Emit the target specific LLVM code to return the @@ -2647,7 +2637,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// IsFinished - If true, indicates that the caller has finished emitting /// branches to the given block and does not expect to emit code into it. This /// means the block can be ignored if it is unreachable. 
- void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false); + void EmitBlock(llvm::BasicBlock *BB, bool IsFinished = false); /// EmitBlockAfterUses - Emit the given block somewhere hopefully /// near its uses, and leave the insertion point in it. @@ -2665,9 +2655,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// HaveInsertPoint - True if an insertion point is defined. If not, this /// indicates that the current code being emitted is unreachable. - bool HaveInsertPoint() const { - return Builder.GetInsertBlock() != nullptr; - } + bool HaveInsertPoint() const { return Builder.GetInsertBlock() != nullptr; } /// EnsureInsertPoint - Ensure that an insertion point is defined so that /// emitted IR has a place to go. Note that by definition, if this function @@ -2768,9 +2756,9 @@ class CodeGenFunction : public CodeGenTypeCache { LValueBaseInfo *PointeeBaseInfo = nullptr, TBAAAccessInfo *PointeeTBAAInfo = nullptr); LValue EmitLoadOfReferenceLValue(LValue RefLVal); - LValue EmitLoadOfReferenceLValue(Address RefAddr, QualType RefTy, - AlignmentSource Source = - AlignmentSource::Type) { + LValue + EmitLoadOfReferenceLValue(Address RefAddr, QualType RefTy, + AlignmentSource Source = AlignmentSource::Type) { LValue RefLVal = MakeAddrLValue(RefAddr, RefTy, LValueBaseInfo(Source), CGM.getTBAAAccessInfo(RefTy)); return EmitLoadOfReferenceLValue(RefLVal); @@ -2914,7 +2902,8 @@ class CodeGenFunction : public CodeGenTypeCache { const CGBitFieldInfo &Info, SourceLocation Loc); - /// EmitIgnoredExpr - Emit an expression in a context which ignores the result. + /// EmitIgnoredExpr - Emit an expression in a context which ignores the + /// result. void EmitIgnoredExpr(const Expr *E); /// EmitAnyExpr - Emit code to compute the specified expression which can have @@ -2942,8 +2931,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitAnyExprToMem - Emits the code necessary to evaluate an /// arbitrary expression into the given memory location. - void EmitAnyExprToMem(const Expr *E, Address Location, - Qualifiers Quals, bool IsInitializer); + void EmitAnyExprToMem(const Expr *E, Address Location, Qualifiers Quals, + bool IsInitializer); void EmitAnyExprToExn(const Expr *E, Address Addr); @@ -3061,8 +3050,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// emitArrayLength - Compute the length of an array, even if it's a /// VLA, and drill down to the base element type. - llvm::Value *emitArrayLength(const ArrayType *arrayType, - QualType &baseType, + llvm::Value *emitArrayLength(const ArrayType *arrayType, QualType &baseType, Address &addr); /// EmitVLASize - Capture all the sizes for the VLA expressions in @@ -3109,24 +3097,21 @@ class CodeGenFunction : public CodeGenTypeCache { /// GetAddressOfBaseOfCompleteClass - Convert the given pointer to a /// complete class to the given direct base. - Address - GetAddressOfDirectBaseInCompleteClass(Address Value, - const CXXRecordDecl *Derived, - const CXXRecordDecl *Base, - bool BaseIsVirtual); + Address GetAddressOfDirectBaseInCompleteClass(Address Value, + const CXXRecordDecl *Derived, + const CXXRecordDecl *Base, + bool BaseIsVirtual); static bool ShouldNullCheckClassCastValue(const CastExpr *Cast); /// GetAddressOfBaseClass - This function will add the necessary delta to the /// load of 'this' and returns address of the base class. 
- Address GetAddressOfBaseClass(Address Value, - const CXXRecordDecl *Derived, + Address GetAddressOfBaseClass(Address Value, const CXXRecordDecl *Derived, CastExpr::path_const_iterator PathBegin, CastExpr::path_const_iterator PathEnd, bool NullCheckValue, SourceLocation Loc); - Address GetAddressOfDerivedClass(Address Value, - const CXXRecordDecl *Derived, + Address GetAddressOfDerivedClass(Address Value, const CXXRecordDecl *Derived, CastExpr::path_const_iterator PathBegin, CastExpr::path_const_iterator PathEnd, bool NullCheckValue); @@ -3185,20 +3170,17 @@ class CodeGenFunction : public CodeGenTypeCache { /// Emit assumption that vptr load == global vtable. void EmitVTableAssumptionLoad(const VPtr &vptr, Address This); - void EmitSynthesizedCXXCopyCtorCall(const CXXConstructorDecl *D, - Address This, Address Src, - const CXXConstructExpr *E); + void EmitSynthesizedCXXCopyCtorCall(const CXXConstructorDecl *D, Address This, + Address Src, const CXXConstructExpr *E); void EmitCXXAggrConstructorCall(const CXXConstructorDecl *D, - const ArrayType *ArrayTy, - Address ArrayPtr, + const ArrayType *ArrayTy, Address ArrayPtr, const CXXConstructExpr *E, bool NewPointerIsChecked, bool ZeroInitialization = false); void EmitCXXAggrConstructorCall(const CXXConstructorDecl *D, - llvm::Value *NumElements, - Address ArrayPtr, + llvm::Value *NumElements, Address ArrayPtr, const CXXConstructExpr *E, bool NewPointerIsChecked, bool ZeroInitialization = false); @@ -3342,7 +3324,6 @@ class CodeGenFunction : public CodeGenTypeCache { /// Get the record field index as represented in debug info. unsigned getDebugInfoFIndex(const RecordDecl *Rec, unsigned FieldIndex); - //===--------------------------------------------------------------------===// // Declaration Emission //===--------------------------------------------------------------------===// @@ -3426,9 +3407,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// Returns the raw, allocated address, which is not necessarily /// the address of the object itself. It is casted to default /// address space for address space agnostic languages. - Address getAllocatedAddress() const { - return Addr; - } + Address getAllocatedAddress() const { return Addr; } /// Returns the address for the original alloca instruction. RawAddress getOriginalAllocatedAddress() const { return AllocaAddr; } @@ -3437,7 +3416,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// Note that this does not chase the forwarding pointer for /// __block decls. Address getObjectAddress(CodeGenFunction &CGF) const { - if (!IsEscapingByRef) return Addr; + if (!IsEscapingByRef) + return Addr; return CGF.emitBlockByrefAddress(Addr, Variable, /*forward*/ false); } @@ -3453,8 +3433,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// QualTypes and size expression's debug node, so that CGDebugInfo can /// reference this node when creating the DISubrange object to describe the /// array types. 
- void EmitAndRegisterVariableArrayDimensions(CGDebugInfo *DI, - const VarDecl &D, + void EmitAndRegisterVariableArrayDimensions(CGDebugInfo *DI, const VarDecl &D, bool EmitDebugInfo); void EmitStaticVarDecl(const VarDecl &D, @@ -3557,10 +3536,9 @@ class CodeGenFunction : public CodeGenTypeCache { Address EmitCompoundStmt(const CompoundStmt &S, bool GetLast = false, AggValueSlot AVS = AggValueSlot::ignored()); - Address EmitCompoundStmtWithoutScope(const CompoundStmt &S, - bool GetLast = false, - AggValueSlot AVS = - AggValueSlot::ignored()); + Address + EmitCompoundStmtWithoutScope(const CompoundStmt &S, bool GetLast = false, + AggValueSlot AVS = AggValueSlot::ignored()); /// EmitLabel - Emit the block for the given label. It is legal to call this /// function even if there is no current insertion point. @@ -3614,8 +3592,7 @@ class CodeGenFunction : public CodeGenTypeCache { void VolatilizeTryBlocks(llvm::BasicBlock *BB, llvm::SmallPtrSet &V); - void pushSEHCleanup(CleanupKind kind, - llvm::Function *FinallyFunc); + void pushSEHCleanup(CleanupKind kind, llvm::Function *FinallyFunc); void startOutlinedSEHHelper(CodeGenFunction &ParentCGF, bool IsFilter, const Stmt *OutlinedStmt); @@ -3626,8 +3603,7 @@ class CodeGenFunction : public CodeGenTypeCache { const SEHFinallyStmt &Finally); void EmitSEHExceptionCodeSave(CodeGenFunction &ParentCGF, - llvm::Value *ParentFP, - llvm::Value *EntryEBP); + llvm::Value *ParentFP, llvm::Value *EntryEBP); llvm::Value *EmitSEHExceptionCode(); llvm::Value *EmitSEHExceptionInfo(); llvm::Value *EmitSEHAbnormalTermination(); @@ -3647,8 +3623,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// outlined functions. ParentFP is the frame pointer of the outermost parent /// frame. Address recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF, - Address ParentVar, - llvm::Value *ParentFP); + Address ParentVar, llvm::Value *ParentFP); void EmitCXXForRangeStmt(const CXXForRangeStmt &S, ArrayRef Attrs = {}); @@ -3701,8 +3676,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// the base array element). /// \param Copy Actual copygin expression for copying data from \a SrcVD to \a /// DestVD. - void EmitOMPCopy(QualType OriginalType, - Address DestAddr, Address SrcAddr, + void EmitOMPCopy(QualType OriginalType, Address DestAddr, Address SrcAddr, const VarDecl *DestVD, const VarDecl *SrcVD, const Expr *Copy); /// Emit atomic update code for constructs: \a X = \a X \a BO \a E or @@ -3828,10 +3802,8 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPTargetTaskBasedDirective(const OMPExecutableDirective &S, const RegionCodeGenTy &BodyGen, OMPTargetDataInfo &InputInfo); - void processInReduction(const OMPExecutableDirective &S, - OMPTaskDataTy &Data, - CodeGenFunction &CGF, - const CapturedStmt *CS, + void processInReduction(const OMPExecutableDirective &S, OMPTaskDataTy &Data, + CodeGenFunction &CGF, const CapturedStmt *CS, OMPPrivateScope &Scope); void EmitOMPMetaDirective(const OMPMetaDirective &S); void EmitOMPParallelDirective(const OMPParallelDirective &S); @@ -4187,13 +4159,11 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitUnsupportedRValue - Emit a dummy r-value using the type of E /// and issue an ErrorUnsupported style diagnostic (using the /// provided Name). 
- RValue EmitUnsupportedRValue(const Expr *E, - const char *Name); + RValue EmitUnsupportedRValue(const Expr *E, const char *Name); /// EmitUnsupportedLValue - Emit a dummy l-value using the type of E and issue /// an ErrorUnsupported style diagnostic (using the provided Name). - LValue EmitUnsupportedLValue(const Expr *E, - const char *Name); + LValue EmitUnsupportedLValue(const Expr *E, const char *Name); /// EmitLValue - Emit code to compute a designator that specifies the location /// of the expression. @@ -4223,8 +4193,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// that the address will be used to access the object. LValue EmitCheckedLValue(const Expr *E, TypeCheckKind TCK); - RValue convertTempToRValue(Address addr, QualType type, - SourceLocation Loc); + RValue convertTempToRValue(Address addr, QualType type, SourceLocation Loc); void EmitAtomicInit(Expr *E, LValue lvalue); @@ -4302,25 +4271,26 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitStoreOfScalar - Store a scalar value to an address, taking /// care to appropriately convert from the memory representation to /// the LLVM value representation. - void EmitStoreOfScalar(llvm::Value *Value, Address Addr, - bool Volatile, QualType Ty, + void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, + QualType Ty, AlignmentSource Source = AlignmentSource::Type, bool isInit = false, bool isNontemporal = false) { EmitStoreOfScalar(Value, Addr, Volatile, Ty, LValueBaseInfo(Source), CGM.getTBAAAccessInfo(Ty), isInit, isNontemporal); } - void EmitStoreOfScalar(llvm::Value *Value, Address Addr, - bool Volatile, QualType Ty, - LValueBaseInfo BaseInfo, TBAAAccessInfo TBAAInfo, - bool isInit = false, bool isNontemporal = false); + void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, + QualType Ty, LValueBaseInfo BaseInfo, + TBAAAccessInfo TBAAInfo, bool isInit = false, + bool isNontemporal = false); /// EmitStoreOfScalar - Store a scalar value to an address, taking /// care to appropriately convert from the memory representation to /// the LLVM value representation. The l-value must be a simple /// l-value. The isInit flag indicates whether this is an initialization. /// If so, atomic qualifiers are ignored and the store is always non-atomic. - void EmitStoreOfScalar(llvm::Value *value, LValue lvalue, bool isInit=false); + void EmitStoreOfScalar(llvm::Value *value, LValue lvalue, + bool isInit = false); /// EmitLoadOfLValue - Given an expression that represents a value lvalue, /// this method emits the address of the lvalue, then loads the result as an @@ -4349,7 +4319,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// bit-field contents after the store, appropriate for use as the result of /// an assignment to the bit-field. void EmitStoreThroughBitfieldLValue(RValue Src, LValue Dst, - llvm::Value **Result=nullptr); + llvm::Value **Result = nullptr); /// Emit an l-value for an assignment (simple or compound) of complex type. 
LValue EmitComplexAssignmentLValue(const BinaryOperator *E); @@ -4402,9 +4372,10 @@ class CodeGenFunction : public CodeGenTypeCache { TBAAAccessInfo *TBAAInfo = nullptr); class ConstantEmission { - llvm::PointerIntPair ValueAndIsReference; + llvm::PointerIntPair ValueAndIsReference; ConstantEmission(llvm::Constant *C, bool isReference) - : ValueAndIsReference(C, isReference) {} + : ValueAndIsReference(C, isReference) {} + public: ConstantEmission() {} static ConstantEmission forReference(llvm::Constant *C) { @@ -4448,7 +4419,7 @@ class CodeGenFunction : public CodeGenTypeCache { const ObjCIvarDecl *Ivar); llvm::Value *EmitIvarOffsetAsPointerDiff(const ObjCInterfaceDecl *Interface, const ObjCIvarDecl *Ivar); - LValue EmitLValueForField(LValue Base, const FieldDecl* Field); + LValue EmitLValueForField(LValue Base, const FieldDecl *Field); LValue EmitLValueForLambdaField(const FieldDecl *Field); LValue EmitLValueForLambdaField(const FieldDecl *Field, llvm::Value *ThisValue); @@ -4456,12 +4427,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitLValueForFieldInitialization - Like EmitLValueForField, except that /// if the Field is a reference, this will return the address of the reference /// and not the address of the value stored in the reference. - LValue EmitLValueForFieldInitialization(LValue Base, - const FieldDecl* Field); + LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field); - LValue EmitLValueForIvar(QualType ObjectTy, - llvm::Value* Base, const ObjCIvarDecl *Ivar, - unsigned CVRQualifiers); + LValue EmitLValueForIvar(QualType ObjectTy, llvm::Value *Base, + const ObjCIvarDecl *Ivar, unsigned CVRQualifiers); LValue EmitCXXConstructLValue(const CXXConstructExpr *E); LValue EmitCXXBindTemporaryLValue(const CXXBindTemporaryExpr *E); @@ -4473,7 +4442,7 @@ class CodeGenFunction : public CodeGenTypeCache { LValue EmitStmtExprLValue(const StmtExpr *E); LValue EmitPointerToDataMemberBinaryExpr(const BinaryOperator *E); LValue EmitObjCSelectorLValue(const ObjCSelectorExpr *E); - void EmitDeclRefExprDbgValue(const DeclRefExpr *E, const APValue &Init); + void EmitDeclRefExprDbgValue(const DeclRefExpr *E, const APValue &Init); //===--------------------------------------------------------------------===// // Scalar Expression Emission @@ -4540,8 +4509,7 @@ class CodeGenFunction : public CodeGenTypeCache { ArrayRef args); CGCallee BuildAppleKextVirtualCall(const CXXMethodDecl *MD, - NestedNameSpecifier *Qual, - llvm::Type *Ty); + NestedNameSpecifier *Qual, llvm::Type *Ty); CGCallee BuildAppleKextVirtualDestructorCall(const CXXDestructorDecl *DD, CXXDtorType Type, @@ -4627,11 +4595,10 @@ class CodeGenFunction : public CodeGenTypeCache { bool HasQualifier, NestedNameSpecifier *Qualifier, bool IsArrow, const Expr *Base, llvm::CallBase **CallOrInvoke); // Compute the object pointer. 
- Address EmitCXXMemberDataPointerAddress(const Expr *E, Address base, - llvm::Value *memberPtr, - const MemberPointerType *memberPtrType, - LValueBaseInfo *BaseInfo = nullptr, - TBAAAccessInfo *TBAAInfo = nullptr); + Address EmitCXXMemberDataPointerAddress( + const Expr *E, Address base, llvm::Value *memberPtr, + const MemberPointerType *memberPtrType, + LValueBaseInfo *BaseInfo = nullptr, TBAAAccessInfo *TBAAInfo = nullptr); RValue EmitCXXMemberPointerCallExpr(const CXXMemberCallExpr *E, ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke); @@ -4692,22 +4659,18 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitCMSEClearRecord(llvm::Value *V, llvm::ArrayType *ATy, QualType RTy); - llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID, - unsigned LLVMIntrinsic, - unsigned AltLLVMIntrinsic, - const char *NameHint, - unsigned Modifier, - const CallExpr *E, - SmallVectorImpl &Ops, - Address PtrOp0, Address PtrOp1, - llvm::Triple::ArchType Arch); + llvm::Value * + EmitCommonNeonBuiltinExpr(unsigned BuiltinID, unsigned LLVMIntrinsic, + unsigned AltLLVMIntrinsic, const char *NameHint, + unsigned Modifier, const CallExpr *E, + SmallVectorImpl &Ops, Address PtrOp0, + Address PtrOp1, llvm::Triple::ArchType Arch); llvm::Function *LookupNeonLLVMIntrinsic(unsigned IntrinsicID, unsigned Modifier, llvm::Type *ArgTy, const CallExpr *E); llvm::Value *EmitNeonCall(llvm::Function *F, - SmallVectorImpl &O, - const char *name, + SmallVectorImpl &O, const char *name, unsigned shift = 0, bool rightshift = false); llvm::Value *EmitFP8NeonCall(unsigned IID, ArrayRef Tys, SmallVectorImpl &O, @@ -4811,7 +4774,7 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Triple::ArchType Arch); llvm::Value *EmitBPFBuiltinExpr(unsigned BuiltinID, const CallExpr *E); - llvm::Value *BuildVector(ArrayRef Ops); + llvm::Value *BuildVector(ArrayRef Ops); llvm::Value *EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E); @@ -4850,8 +4813,9 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitObjCBoxedExpr(const ObjCBoxedExpr *E); llvm::Value *EmitObjCArrayLiteral(const ObjCArrayLiteral *E); llvm::Value *EmitObjCDictionaryLiteral(const ObjCDictionaryLiteral *E); - llvm::Value *EmitObjCCollectionLiteral(const Expr *E, - const ObjCMethodDecl *MethodWithObjects); + llvm::Value * + EmitObjCCollectionLiteral(const Expr *E, + const ObjCMethodDecl *MethodWithObjects); llvm::Value *EmitObjCSelectorExpr(const ObjCSelectorExpr *E); RValue EmitObjCMessageExpr(const ObjCMessageExpr *E, ReturnValueSlot Return = ReturnValueSlot()); @@ -4859,8 +4823,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// Retrieves the default cleanup kind for an ARC cleanup. /// Except under -fobjc-arc-eh, ARC cleanups are normal-only. CleanupKind getARCCleanupKind() { - return CGM.getCodeGenOpts().ObjCAutoRefCountExceptions - ? NormalAndEHCleanup : NormalCleanup; + return CGM.getCodeGenOpts().ObjCAutoRefCountExceptions ? NormalAndEHCleanup + : NormalCleanup; } // ARC primitives. 
@@ -4895,15 +4859,14 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Type *returnType); void EmitObjCRelease(llvm::Value *value, ARCPreciseLifetime_t precise); - std::pair + std::pair EmitARCStoreAutoreleasing(const BinaryOperator *e); - std::pair - EmitARCStoreStrong(const BinaryOperator *e, bool ignored); - std::pair + std::pair EmitARCStoreStrong(const BinaryOperator *e, + bool ignored); + std::pair EmitARCStoreUnsafeUnretained(const BinaryOperator *e, bool ignored); - llvm::Value *EmitObjCAlloc(llvm::Value *value, - llvm::Type *returnType); + llvm::Value *EmitObjCAlloc(llvm::Value *value, llvm::Type *returnType); llvm::Value *EmitObjCAllocWithZone(llvm::Value *value, llvm::Type *returnType); llvm::Value *EmitObjCAllocInit(llvm::Value *value, llvm::Type *resultType); @@ -4919,7 +4882,7 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitARCRetainAutoreleaseScalarExpr(const Expr *expr); llvm::Value *EmitARCUnsafeUnretainedScalarExpr(const Expr *expr); - void EmitARCIntrinsicUse(ArrayRef values); + void EmitARCIntrinsicUse(ArrayRef values); void EmitARCNoopIntrinsicUse(ArrayRef values); @@ -4946,7 +4909,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitScalarExpr - Emit the computation of the specified expression of LLVM /// scalar type, returning the result. - llvm::Value *EmitScalarExpr(const Expr *E , bool IgnoreResultAssign = false); + llvm::Value *EmitScalarExpr(const Expr *E, bool IgnoreResultAssign = false); /// Emit a conversion from the specified type to the specified destination /// type, both of which are LLVM scalar types. @@ -4986,8 +4949,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// EmitComplexExpr - Emit the computation of the specified expression of /// complex type, returning the result. - ComplexPairTy EmitComplexExpr(const Expr *E, - bool IgnoreReal = false, + ComplexPairTy EmitComplexExpr(const Expr *E, bool IgnoreReal = false, bool IgnoreImag = false); /// EmitComplexExprIntoLValue - Emit the given expression of complex @@ -5003,7 +4965,8 @@ class CodeGenFunction : public CodeGenTypeCache { ComplexPairTy EmitPromotedComplexExpr(const Expr *E, QualType PromotionType); llvm::Value *EmitPromotedScalarExpr(const Expr *E, QualType PromotionType); ComplexPairTy EmitPromotedValue(ComplexPairTy result, QualType PromotionType); - ComplexPairTy EmitUnPromotedValue(ComplexPairTy result, QualType PromotionType); + ComplexPairTy EmitUnPromotedValue(ComplexPairTy result, + QualType PromotionType); Address emitAddrOfRealComponent(Address complex, QualType complexType); Address emitAddrOfImagComponent(Address complex, QualType complexType); @@ -5012,9 +4975,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// global variable that has already been created for it. If the initializer /// has a different type than GV does, this may free GV and return a different /// one. Otherwise it just returns GV. - llvm::GlobalVariable * - AddInitializerToStaticVarDecl(const VarDecl &D, - llvm::GlobalVariable *GV); + llvm::GlobalVariable *AddInitializerToStaticVarDecl(const VarDecl &D, + llvm::GlobalVariable *GV); // Emit an @llvm.invariant.start call for the given memory region. void EmitInvariantStart(llvm::Constant *Addr, CharUnits Size); @@ -5061,8 +5023,8 @@ class CodeGenFunction : public CodeGenTypeCache { /// Emit a branch to select whether or not to perform guarded initialization. 
void EmitCXXGuardedInitBranch(llvm::Value *NeedsInit, llvm::BasicBlock *InitBlock, - llvm::BasicBlock *NoInitBlock, - GuardKind Kind, const VarDecl *D); + llvm::BasicBlock *NoInitBlock, GuardKind Kind, + const VarDecl *D); /// GenerateCXXGlobalInitFunc - Generates code for initializing global /// variables. @@ -5079,8 +5041,7 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Constant *>> DtorsOrStermFinalizers); - void GenerateCXXGlobalVarDeclInitFunc(llvm::Function *Fn, - const VarDecl *D, + void GenerateCXXGlobalVarDeclInitFunc(llvm::Function *Fn, const VarDecl *D, llvm::GlobalVariable *Addr, bool PerformInit); @@ -5185,8 +5146,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// is a subtraction. llvm::Value *EmitCheckedInBoundsGEP(llvm::Type *ElemTy, llvm::Value *Ptr, ArrayRef IdxList, - bool SignedIndices, - bool IsSubtraction, + bool SignedIndices, bool IsSubtraction, SourceLocation Loc, const Twine &Name = ""); @@ -5369,8 +5329,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// if E is a parameter with the pass_object_size attribute. llvm::Value *emitBuiltinObjectSize(const Expr *E, unsigned Type, llvm::IntegerType *ResType, - llvm::Value *EmittedE, - bool IsDynamic); + llvm::Value *EmittedE, bool IsDynamic); llvm::Value *emitCountedByMemberSize(const Expr *E, llvm::Value *EmittedE, unsigned Type, @@ -5461,7 +5420,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitDeclMetadata(); BlockByrefHelpers *buildByrefHelpers(llvm::StructType &byrefType, - const AutoVarEmission &emission); + const AutoVarEmission &emission); void AddObjCARCExceptionMetadata(llvm::Instruction *Inst); @@ -5481,7 +5440,8 @@ class CodeGenFunction : public CodeGenTypeCache { inline DominatingLLVMValue::saved_type DominatingLLVMValue::save(CodeGenFunction &CGF, llvm::Value *value) { - if (!needsSaving(value)) return saved_type(value, false); + if (!needsSaving(value)) + return saved_type(value, false); // Otherwise, we need an alloca. auto align = CharUnits::fromQuantity( @@ -5496,7 +5456,8 @@ DominatingLLVMValue::save(CodeGenFunction &CGF, llvm::Value *value) { inline llvm::Value *DominatingLLVMValue::restore(CodeGenFunction &CGF, saved_type value) { // If the value says it wasn't saved, trust that it's still dominating. - if (!value.getInt()) return value.getPointer(); + if (!value.getInt()) + return value.getPointer(); // Otherwise, it should be an alloca instruction, as set up in save(). auto alloca = cast(value.getPointer()); @@ -5504,12 +5465,12 @@ inline llvm::Value *DominatingLLVMValue::restore(CodeGenFunction &CGF, alloca->getAlign()); } -} // end namespace CodeGen +} // end namespace CodeGen // Map the LangOption for floating point exception behavior into // the corresponding enum in the IR. 
llvm::fp::ExceptionBehavior ToConstrainedExceptMD(LangOptions::FPExceptionModeKind Kind); -} // end namespace clang +} // end namespace clang #endif diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 0956296e2d5d8..c6f6fd5b9a7bd 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1067,7 +1067,8 @@ class CodeGenModule : public CodeGenTypeCache { bool shouldEmitRTTI(bool ForEH = false) { return (ForEH || getLangOpts().RTTI) && !getLangOpts().CUDAIsDevice && !(getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice && - (getTriple().isNVPTX() || getTriple().isAMDGPU())); + (getTriple().isNVPTX() || getTriple().isAMDGPU() || + getTriple().isSPIRV())); } /// Get the address of the RTTI descriptor for the given type. diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index b82e4ddb9f3f2..f89d32d4e13fe 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -375,11 +375,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F, if (MinBlocks > 0) { if (MinBlocksVal) *MinBlocksVal = MinBlocks.getExtValue(); - if (F) { - // Create !{, metadata !"minctasm", i32 } node - NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm", - MinBlocks.getExtValue()); - } + if (F) + F->addFnAttr("nvvm.minctasm", llvm::utostr(MinBlocks.getExtValue())); } } if (Attr->getMaxBlocks()) { @@ -388,11 +385,9 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F, if (MaxBlocks > 0) { if (MaxClusterRankVal) *MaxClusterRankVal = MaxBlocks.getExtValue(); - if (F) { - // Create !{, metadata !"maxclusterrank", i32 } node - NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank", - MaxBlocks.getExtValue()); - } + if (F) + F->addFnAttr("nvvm.maxclusterrank", + llvm::utostr(MaxBlocks.getExtValue())); } } } diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index e66e5a32e58ac..8d5cb91ebad9a 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -950,13 +950,7 @@ void ROCMToolChain::addClangTargetOptions( ABIVer)) return; - std::tuple GPUSan( - DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, true), - getSanitizerArgs(DriverArgs)); - bool Wave64 = isWave64(DriverArgs, Kind); - // TODO: There are way too many flags that change this. Do we need to check // them all? bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) || @@ -969,6 +963,12 @@ void ROCMToolChain::addClangTargetOptions( bool CorrectSqrt = DriverArgs.hasArg(options::OPT_cl_fp32_correctly_rounded_divide_sqrt); + // GPU Sanitizer currently only supports ASan and is enabled through host + // ASan. + bool GPUSan = DriverArgs.hasFlag(options::OPT_fgpu_sanitize, + options::OPT_fno_gpu_sanitize, true) && + getSanitizerArgs(DriverArgs).needsAsanRt(); + // Add the OpenCL specific bitcode library. 
llvm::SmallVector BCLibs; BCLibs.emplace_back(RocmInstallation->getOpenCLPath().str()); @@ -1009,30 +1009,25 @@ llvm::SmallVector RocmInstallationDetector::getCommonBitcodeLibs( const llvm::opt::ArgList &DriverArgs, StringRef LibDeviceFile, bool Wave64, bool DAZ, bool FiniteOnly, bool UnsafeMathOpt, bool FastRelaxedMath, - bool CorrectSqrt, DeviceLibABIVersion ABIVer, - const std::tuple &GPUSan, - bool isOpenMP = false) const { + bool CorrectSqrt, DeviceLibABIVersion ABIVer, bool GPUSan, + bool isOpenMP) const { llvm::SmallVector BCLibs; - auto GPUSanEnabled = [GPUSan]() { return std::get(GPUSan); }; auto AddBCLib = [&](ToolChain::BitCodeLibraryInfo BCLib, bool Internalize = true) { BCLib.ShouldInternalize = Internalize; BCLibs.emplace_back(BCLib); }; auto AddSanBCLibs = [&]() { - if (GPUSanEnabled()) { - auto SanArgs = std::get(GPUSan); - if (SanArgs.needsAsanRt()) - AddBCLib(getAsanRTLPath(), false); - } + if (GPUSan) + AddBCLib(getAsanRTLPath(), false); }; AddSanBCLibs(); AddBCLib(getOCMLPath()); if (!isOpenMP) AddBCLib(getOCKLPath()); - else if (GPUSanEnabled() && isOpenMP) + else if (GPUSan && isOpenMP) AddBCLib(getOCKLPath(), false); AddBCLib(getDenormalsAreZeroPath(DAZ)); AddBCLib(getUnsafeMathPath(UnsafeMathOpt || FastRelaxedMath)); @@ -1064,10 +1059,6 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, // If --hip-device-lib is not set, add the default bitcode libraries. // TODO: There are way too many flags that change this. Do we need to check // them all? - std::tuple GPUSan( - DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, false), - getSanitizerArgs(DriverArgs)); bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero, options::OPT_fno_gpu_flush_denormals_to_zero, getDefaultDenormsAreZeroForTarget(Kind)); @@ -1083,6 +1074,12 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt, true); bool Wave64 = isWave64(DriverArgs, Kind); + // GPU Sanitizer currently only supports ASan and is enabled through host + // ASan. + bool GPUSan = DriverArgs.hasFlag(options::OPT_fgpu_sanitize, + options::OPT_fno_gpu_sanitize, true) && + getSanitizerArgs(DriverArgs).needsAsanRt(); + return RocmInstallation->getCommonBitcodeLibs( DriverArgs, LibDeviceFile, Wave64, DAZ, FiniteOnly, UnsafeMathOpt, FastRelaxedMath, CorrectSqrt, ABIVer, GPUSan, isOpenMP); @@ -1095,11 +1092,12 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption( if (TargetID.empty()) return false; Option O = A->getOption(); + if (!O.matches(options::OPT_fsanitize_EQ)) return false; if (!DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, false)) + options::OPT_fno_gpu_sanitize, true)) return true; auto &Diags = TC.getDriver().getDiags(); diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 00bf9c7338edd..85247f7bd5a9e 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -68,11 +68,18 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs( Action::OffloadKind DeviceOffloadKind) const { DerivedArgList *DAL = HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind); + if (!DAL) DAL = new DerivedArgList(Args.getBaseArgs()); const OptTable &Opts = getDriver().getOpts(); + // Skip sanitize options passed from the HostTC. Claim them early. + // The decision to sanitize device code is computed only by + // 'shouldSkipSanitizeOption'. 
+ if (DAL->hasArg(options::OPT_fsanitize_EQ)) + DAL->claimAllArgs(options::OPT_fsanitize_EQ); + for (Arg *A : Args) if (!shouldSkipSanitizeOption(*this, Args, BoundArch, A) && !llvm::is_contained(*DAL, A)) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5deafa2ad0f4a..8fe76de7ff0f0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2633,6 +2633,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, bool UseNoExecStack = false; bool Msa = false; const char *MipsTargetFeature = nullptr; + llvm::SmallVector SparcTargetFeatures; StringRef ImplicitIt; for (const Arg *A : Args.filtered(options::OPT_Wa_COMMA, options::OPT_Xassembler, @@ -2778,6 +2779,31 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, if (MipsTargetFeature) continue; break; + + case llvm::Triple::sparc: + case llvm::Triple::sparcel: + case llvm::Triple::sparcv9: + if (Value == "--undeclared-regs") { + // LLVM already allows undeclared use of G registers, so this option + // becomes a no-op. This solely exists for GNU compatibility. + // TODO implement --no-undeclared-regs + continue; + } + SparcTargetFeatures = + llvm::StringSwitch>(Value) + .Case("-Av8", {"-v8plus"}) + .Case("-Av8plus", {"+v8plus", "+v9"}) + .Case("-Av8plusa", {"+v8plus", "+v9", "+vis"}) + .Case("-Av8plusb", {"+v8plus", "+v9", "+vis", "+vis2"}) + .Case("-Av8plusd", {"+v8plus", "+v9", "+vis", "+vis2", "+vis3"}) + .Case("-Av9", {"+v9"}) + .Case("-Av9a", {"+v9", "+vis"}) + .Case("-Av9b", {"+v9", "+vis", "+vis2"}) + .Case("-Av9d", {"+v9", "+vis", "+vis2", "+vis3"}) + .Default({}); + if (!SparcTargetFeatures.empty()) + continue; + break; } if (Value == "-force_cpusubtype_ALL") { @@ -2882,6 +2908,10 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, CmdArgs.push_back("-target-feature"); CmdArgs.push_back(MipsTargetFeature); } + for (const char *Feature : SparcTargetFeatures) { + CmdArgs.push_back("-target-feature"); + CmdArgs.push_back(Feature); + } // forward -fembed-bitcode to assmebler if (C.getDriver().embedBitcodeEnabled() || diff --git a/clang/lib/Driver/ToolChains/ROCm.h b/clang/lib/Driver/ToolChains/ROCm.h index 681c242b0678e..a6cc41db383b6 100644 --- a/clang/lib/Driver/ToolChains/ROCm.h +++ b/clang/lib/Driver/ToolChains/ROCm.h @@ -178,7 +178,7 @@ class RocmInstallationDetector { const llvm::opt::ArgList &DriverArgs, StringRef LibDeviceFile, bool Wave64, bool DAZ, bool FiniteOnly, bool UnsafeMathOpt, bool FastRelaxedMath, bool CorrectSqrt, DeviceLibABIVersion ABIVer, - const std::tuple &GPUSan, bool isOpenMP) const; + bool GPUSan, bool isOpenMP) const; /// Check file paths of default bitcode libraries common to AMDGPU based /// toolchains. \returns false if there are invalid or missing files. 
bool checkCommonBitcodeLibs(StringRef GPUArch, StringRef LibDeviceFile, diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 46545aa1f4c07..a5b30c85974c7 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -1596,7 +1596,7 @@ static auto computeNewlines(const AnnotatedLine &Line, if (Line.startsWith(TT_NamespaceRBrace)) { if (Style.WrapNamespaceBodyWithEmptyLines == FormatStyle::WNBWELS_Never) Newlines = 1; - else if (!PreviousLine->startsWith(TT_NamespaceRBrace)) + else if (PreviousLine && !PreviousLine->startsWith(TT_NamespaceRBrace)) Newlines = std::max(Newlines, 2u); } } diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index 1005825441b3e..2ae355fb33885 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -974,11 +974,10 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, // Loop over the contents and print them as a comma-delimited list of // values. bool PrintComma = false; - for (auto Iter = Data->BinaryData.begin(), End = Data->BinaryData.end(); - Iter != End; ++Iter) { + for (unsigned char Byte : Data->BinaryData.bytes()) { if (PrintComma) *Callbacks->OS << ", "; - *Callbacks->OS << static_cast(*Iter); + *Callbacks->OS << static_cast(Byte); PrintComma = true; } } else if (Tok.isAnnotation()) { diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index c9806a77d5ef6..3af4d001d6c1a 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -210,37 +210,46 @@ QualType HeuristicResolverImpl::getPointeeType(QualType T) { QualType HeuristicResolverImpl::simplifyType(QualType Type, const Expr *E, bool UnwrapPointer) { bool DidUnwrapPointer = false; - auto SimplifyOneStep = [&](QualType T) { + // A type, together with an optional expression whose type it represents + // which may have additional information about the expression's type + // not stored in the QualType itself. + struct TypeExprPair { + QualType Type; + const Expr *E = nullptr; + }; + TypeExprPair Current{Type, E}; + auto SimplifyOneStep = [UnwrapPointer, &DidUnwrapPointer, + this](TypeExprPair T) -> TypeExprPair { if (UnwrapPointer) { - if (QualType Pointee = getPointeeType(T); !Pointee.isNull()) { + if (QualType Pointee = getPointeeType(T.Type); !Pointee.isNull()) { DidUnwrapPointer = true; - return Pointee; + return {Pointee}; } } - if (const auto *RT = T->getAs()) { + if (const auto *RT = T.Type->getAs()) { // Does not count as "unwrap pointer". - return RT->getPointeeType(); + return {RT->getPointeeType()}; } - if (const auto *BT = T->getAs()) { + if (const auto *BT = T.Type->getAs()) { // If BaseType is the type of a dependent expression, it's just // represented as BuiltinType::Dependent which gives us no information. We // can get further by analyzing the dependent expression. - if (E && BT->getKind() == BuiltinType::Dependent) { - return resolveExprToType(E); + if (T.E && BT->getKind() == BuiltinType::Dependent) { + return {resolveExprToType(T.E), T.E}; } } - if (const auto *AT = T->getContainedAutoType()) { + if (const auto *AT = T.Type->getContainedAutoType()) { // If T contains a dependent `auto` type, deduction will not have // been performed on it yet. In simple cases (e.g. `auto` variable with // initializer), get the approximate type that would result from // deduction. 
// FIXME: A more accurate implementation would propagate things like the // `const` in `const auto`. - if (E && AT->isUndeducedAutoType()) { - if (const auto *DRE = dyn_cast(E)) { + if (T.E && AT->isUndeducedAutoType()) { + if (const auto *DRE = dyn_cast(T.E)) { if (const auto *VD = dyn_cast(DRE->getDecl())) { - if (VD->hasInit()) - return resolveExprToType(VD->getInit()); + if (auto *Init = VD->getInit()) + return {resolveExprToType(Init), Init}; } } } @@ -251,15 +260,15 @@ QualType HeuristicResolverImpl::simplifyType(QualType Type, const Expr *E, // simplification steps. size_t StepCount = 0; const size_t MaxSteps = 64; - while (!Type.isNull() && StepCount++ < MaxSteps) { - QualType New = SimplifyOneStep(Type); - if (New == Type) + while (!Current.Type.isNull() && StepCount++ < MaxSteps) { + TypeExprPair New = SimplifyOneStep(Current); + if (New.Type == Current.Type) break; - Type = New; + Current = New; } if (UnwrapPointer && !DidUnwrapPointer) return QualType(); - return Type; + return Current.Type; } std::vector HeuristicResolverImpl::resolveMemberExpr( diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 74e0fcec2d911..6eedc77ed20a0 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -19633,23 +19633,6 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, ProcessAPINotes(Record); } -/// Determine whether the given integral value is representable within -/// the given type T. -static bool isRepresentableIntegerValue(ASTContext &Context, - llvm::APSInt &Value, - QualType T) { - assert((T->isIntegralType(Context) || T->isEnumeralType()) && - "Integral type required!"); - unsigned BitWidth = Context.getIntWidth(T); - - if (Value.isUnsigned() || Value.isNonNegative()) { - if (T->isSignedIntegerOrEnumerationType()) - --BitWidth; - return Value.getActiveBits() <= BitWidth; - } - return Value.getSignificantBits() <= BitWidth; -} - // Given an integral type, return the next larger integral type // (or a NULL type of no such type exists). static QualType getNextLargerIntegralType(ASTContext &Context, QualType T) { @@ -19723,7 +19706,7 @@ EnumConstantDecl *Sema::CheckEnumConstant(EnumDecl *Enum, // representable in the underlying type of the enumeration. In C++11, // we perform a non-narrowing conversion as part of converted constant // expression checking. - if (!isRepresentableIntegerValue(Context, EnumVal, EltTy)) { + if (!Context.isRepresentableIntegerValue(EnumVal, EltTy)) { if (Context.getTargetInfo() .getTriple() .isWindowsMSVCEnvironment()) { @@ -19752,7 +19735,7 @@ EnumConstantDecl *Sema::CheckEnumConstant(EnumDecl *Enum, // representable as an int. // Complain if the value is not representable in an int. - if (!isRepresentableIntegerValue(Context, EnumVal, Context.IntTy)) { + if (!Context.isRepresentableIntegerValue(EnumVal, Context.IntTy)) { Diag(IdLoc, getLangOpts().C23 ? diag::warn_c17_compat_enum_value_not_int : diag::ext_c23_enum_value_not_int) @@ -19844,7 +19827,7 @@ EnumConstantDecl *Sema::CheckEnumConstant(EnumDecl *Enum, : diag::ext_c23_enum_value_not_int) << 1 << toString(EnumVal, 10) << 1; } else if (!getLangOpts().CPlusPlus && !EltTy->isDependentType() && - !isRepresentableIntegerValue(Context, EnumVal, EltTy)) { + !Context.isRepresentableIntegerValue(EnumVal, EltTy)) { // Enforce C99 6.7.2.2p2 even when we compute the next value. Diag(IdLoc, getLangOpts().C23 ? 
diag::warn_c17_compat_enum_value_not_int : diag::ext_c23_enum_value_not_int) @@ -20171,35 +20154,8 @@ void Sema::ActOnEnumBody(SourceLocation EnumLoc, SourceRange BraceRange, // reverse the list. unsigned NumNegativeBits = 0; unsigned NumPositiveBits = 0; - bool MembersRepresentableByInt = true; - - for (unsigned i = 0, e = Elements.size(); i != e; ++i) { - EnumConstantDecl *ECD = - cast_or_null(Elements[i]); - if (!ECD) continue; // Already issued a diagnostic. - - llvm::APSInt InitVal = ECD->getInitVal(); - - // Keep track of the size of positive and negative values. - if (InitVal.isUnsigned() || InitVal.isNonNegative()) { - // If the enumerator is zero that should still be counted as a positive - // bit since we need a bit to store the value zero. - unsigned ActiveBits = InitVal.getActiveBits(); - NumPositiveBits = std::max({NumPositiveBits, ActiveBits, 1u}); - } else { - NumNegativeBits = - std::max(NumNegativeBits, (unsigned)InitVal.getSignificantBits()); - } - MembersRepresentableByInt &= - isRepresentableIntegerValue(Context, InitVal, Context.IntTy); - } - - // If we have an empty set of enumerators we still need one bit. - // From [dcl.enum]p8 - // If the enumerator-list is empty, the values of the enumeration are as if - // the enumeration had a single enumerator with value 0 - if (!NumPositiveBits && !NumNegativeBits) - NumPositiveBits = 1; + bool MembersRepresentableByInt = + Context.computeEnumBits(Elements, NumNegativeBits, NumPositiveBits); // Figure out the type that should be used for this enum. QualType BestType; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 376995d624e28..39ce65381a98c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5078,7 +5078,8 @@ static bool checkIfClauses(Sema &S, OpenMPDirectiveKind Kind, // At most one if clause without a directive-name-modifier can appear on // the directive. OpenMPDirectiveKind CurNM = IC->getNameModifier(); - if (FoundNameModifiers[CurNM]) { + auto &FNM = FoundNameModifiers[CurNM]; + if (FNM) { S.Diag(C->getBeginLoc(), diag::err_omp_more_one_clause) << getOpenMPDirectiveName(Kind) << getOpenMPClauseName(OMPC_if) << (CurNM != OMPD_unknown) << getOpenMPDirectiveName(CurNM); @@ -5087,7 +5088,7 @@ static bool checkIfClauses(Sema &S, OpenMPDirectiveKind Kind, NameModifierLoc.push_back(IC->getNameModifierLoc()); ++NamedModifiersNumber; } - FoundNameModifiers[CurNM] = IC; + FNM = IC; if (CurNM == OMPD_unknown) continue; // Check if the specified name modifier is allowed for the current @@ -6759,16 +6760,15 @@ SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareSimdDirective( ->getCanonicalDecl() == CanonPVD) { // OpenMP [2.8.1, simd construct, Restrictions] // A list-item cannot appear in more than one aligned clause. 
- if (AlignedArgs.count(CanonPVD) > 0) { + auto [It, Inserted] = AlignedArgs.try_emplace(CanonPVD, E); + if (!Inserted) { Diag(E->getExprLoc(), diag::err_omp_used_in_clause_twice) << 1 << getOpenMPClauseName(OMPC_aligned) << E->getSourceRange(); - Diag(AlignedArgs[CanonPVD]->getExprLoc(), - diag::note_omp_explicit_dsa) + Diag(It->second->getExprLoc(), diag::note_omp_explicit_dsa) << getOpenMPClauseName(OMPC_aligned); continue; } - AlignedArgs[CanonPVD] = E; QualType QTy = PVD->getType() .getNonReferenceType() .getUnqualifiedType() diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index e5931f4684a57..b789824d97020 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -998,8 +998,6 @@ getRHSTemplateDeclAndArgs(Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate) { Template = CTSD->getSpecializedTemplate(); AliasRhsTemplateArgs = CTSD->getTemplateArgs().asArray(); } - } else { - assert(false && "unhandled RHS type of the alias"); } return {Template, AliasRhsTemplateArgs}; } diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index 7460781799d08..db784f2cc77b2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -145,7 +145,8 @@ using MutexDescriptor = std::variant; -class BlockInCriticalSectionChecker : public Checker { +class BlockInCriticalSectionChecker + : public Checker { private: const std::array MutexDescriptors{ // NOTE: There are standard library implementations where some methods @@ -179,6 +180,8 @@ class BlockInCriticalSectionChecker : public Checker { {CDM::CLibrary, {"read"}}, {CDM::CLibrary, {"recv"}}}; + const CallDescription OpenFunction{CDM::CLibrary, {"open"}, 2}; + const BugType BlockInCritSectionBugType{ this, "Call to blocking function in critical section", "Blocking Error"}; @@ -197,6 +200,8 @@ class BlockInCriticalSectionChecker : public Checker { void handleUnlock(const MutexDescriptor &Mutex, const CallEvent &Call, CheckerContext &C) const; + void handleOpen(const CallEvent &Call, CheckerContext &C) const; + [[nodiscard]] bool isBlockingInCritSection(const CallEvent &Call, CheckerContext &C) const; @@ -205,11 +210,14 @@ class BlockInCriticalSectionChecker : public Checker { /// Process lock. /// Process blocking functions (sleep, getc, fgets, read, recv) void checkPostCall(const CallEvent &Call, CheckerContext &C) const; + + void checkDeadSymbols(SymbolReaper &SymReaper, CheckerContext &C) const; }; } // end anonymous namespace REGISTER_LIST_WITH_PROGRAMSTATE(ActiveCritSections, CritSectionMarker) +REGISTER_SET_WITH_PROGRAMSTATE(NonBlockFileDescriptor, SymbolRef) // Iterator traits for ImmutableList data structure // that enable the use of STL algorithms. 
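The BlockInCriticalSectionChecker hunks below add a NonBlockFileDescriptor program-state set, an open() handler, and a dead-symbol cleanup so that read()/recv() on a descriptor opened with O_NONBLOCK (or known to be -1) is no longer reported as a blocking call inside a critical section. A minimal sketch of the kind of source the checker is now expected to accept and to keep flagging; the path, mutex, buffer, and function names are assumptions for illustration, not taken from the patch or its tests:

#include <fcntl.h>
#include <mutex>
#include <unistd.h>

std::mutex M;
char Buf[256];

void readNonBlocking() {
  // The return symbol of this open() gets added to NonBlockFileDescriptor
  // because the flags argument contains O_NONBLOCK.
  int FD = open("/dev/ttyS0", O_RDONLY | O_NONBLOCK);
  M.lock();
  read(FD, Buf, sizeof(Buf)); // previously flagged; now suppressed
  M.unlock();
}

void readBlocking() {
  int FD = open("/dev/ttyS0", O_RDONLY); // no O_NONBLOCK, so not tracked
  M.lock();
  read(FD, Buf, sizeof(Buf)); // still reported: blocking call in critical section
  M.unlock();
}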
@@ -306,6 +314,25 @@ void BlockInCriticalSectionChecker::handleUnlock( C.addTransition(State); } +void BlockInCriticalSectionChecker::handleOpen(const CallEvent &Call, + CheckerContext &C) const { + const auto *Flag = Call.getArgExpr(1); + static std::optional ValueOfONonBlockVFlag = + tryExpandAsInteger("O_NONBLOCK", C.getBugReporter().getPreprocessor()); + if (!ValueOfONonBlockVFlag) + return; + + SVal FlagSV = C.getState()->getSVal(Flag, C.getLocationContext()); + const llvm::APSInt *FlagV = FlagSV.getAsInteger(); + if (!FlagV) + return; + + if ((*FlagV & ValueOfONonBlockVFlag.value()) != 0) + if (SymbolRef SR = Call.getReturnValue().getAsSymbol()) { + C.addTransition(C.getState()->add(SR)); + } +} + bool BlockInCriticalSectionChecker::isBlockingInCritSection( const CallEvent &Call, CheckerContext &C) const { return BlockingFunctions.contains(Call) && @@ -315,6 +342,27 @@ bool BlockInCriticalSectionChecker::isBlockingInCritSection( void BlockInCriticalSectionChecker::checkPostCall(const CallEvent &Call, CheckerContext &C) const { if (isBlockingInCritSection(Call, C)) { + // for 'read' and 'recv' call, check whether it's file descriptor(first + // argument) is + // created by 'open' API with O_NONBLOCK flag or is equal to -1, they will + // not cause block in these situations, don't report + StringRef FuncName = Call.getCalleeIdentifier()->getName(); + if (FuncName == "read" || FuncName == "recv") { + const auto *Arg = Call.getArgExpr(0); + if (!Arg) + return; + + SVal SV = C.getSVal(Arg); + if (const auto *IntValue = SV.getAsInteger()) { + if (*IntValue == -1) + return; + } + + SymbolRef SR = C.getSVal(Arg).getAsSymbol(); + if (SR && C.getState()->contains(SR)) { + return; + } + } reportBlockInCritSection(Call, C); } else if (std::optional LockDesc = checkDescriptorMatch(Call, C, /*IsLock=*/true)) { @@ -322,9 +370,26 @@ void BlockInCriticalSectionChecker::checkPostCall(const CallEvent &Call, } else if (std::optional UnlockDesc = checkDescriptorMatch(Call, C, /*IsLock=*/false)) { handleUnlock(*UnlockDesc, Call, C); + } else if (OpenFunction.matches(Call)) { + handleOpen(Call, C); } } +void BlockInCriticalSectionChecker::checkDeadSymbols(SymbolReaper &SymReaper, + CheckerContext &C) const { + ProgramStateRef State = C.getState(); + + // Remove the dead symbols from the NonBlockFileDescriptor set. 
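+  // Dropping entries for dead symbols keeps the program state small and lets
+  // the engine merge more exploded nodes; a descriptor whose symbol is dead
+  // can never be queried again from checkPostCall anyway.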
+ NonBlockFileDescriptorTy Tracked = State->get(); + for (SymbolRef SR : Tracked) { + if (SymReaper.isDead(SR)) { + State = State->remove(SR); + } + } + + C.addTransition(State); +} + void BlockInCriticalSectionChecker::reportBlockInCritSection( const CallEvent &Call, CheckerContext &C) const { ExplodedNode *ErrNode = C.generateNonFatalErrorNode(C.getState()); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index abf5d3ec193a4..a12853d01819f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -154,19 +154,20 @@ bool isConstOwnerPtrMemberExpr(const clang::Expr *E) { if (auto *MCE = dyn_cast(E)) { if (auto *Callee = MCE->getDirectCallee()) { auto Name = safeGetName(Callee); - if (Name == "get" || Name == "ptr") { - auto *ThisArg = MCE->getImplicitObjectArgument(); - E = ThisArg; - } + if (Name == "get" || Name == "ptr") + E = MCE->getImplicitObjectArgument(); + if (isa(Callee)) + E = MCE->getImplicitObjectArgument(); } } else if (auto *OCE = dyn_cast(E)) { if (OCE->getOperator() == OO_Star && OCE->getNumArgs() == 1) E = OCE->getArg(0); } - auto *ME = dyn_cast(E); - if (!ME) - return false; - auto *D = ME->getMemberDecl(); + const ValueDecl *D = nullptr; + if (auto *ME = dyn_cast(E)) + D = ME->getMemberDecl(); + else if (auto *IVR = dyn_cast(E)) + D = IVR->getDecl(); if (!D) return false; auto T = D->getType(); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index e3ec7c57571c8..c799687ccb44f 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2813,13 +2813,24 @@ void ExprEngine::processBranch( if (StTrue && StFalse) assert(!isa(Condition)); + // We want to ensure consistent behavior between `eagerly-assume=false`, + // when the state split is always performed by the `assumeCondition()` + // call within this function and `eagerly-assume=true` (the default), when + // some conditions (comparison operators, unary negation) can trigger a + // state split before this callback. There are some contrived corner cases + // that behave differently with and without `eagerly-assume`, but I don't + // know about an example that could plausibly appear in "real" code. + bool BothFeasible = + (StTrue && StFalse) || + didEagerlyAssumeBifurcateAt(PrevState, dyn_cast(Condition)); + if (StTrue) { - // If we are processing a loop condition where two iterations have - // already been completed and the false branch is also feasible, then - // don't assume a third iteration because it is a redundant execution - // path (unlikely to be different from earlier loop exits) and can cause - // false positives if e.g. the loop iterates over a two-element structure - // with an opaque condition. + // In a loop, if both branches are feasible (i.e. the analyzer doesn't + // understand the loop condition) and two iterations have already been + // completed, then don't assume a third iteration because it is a + // redundant execution path (unlikely to be different from earlier loop + // exits) and can cause false positives if e.g. the loop iterates over a + // two-element structure with an opaque condition. // // The iteration count "2" is hardcoded because it's the natural limit: // * the fact that the programmer wrote a loop (and not just an `if`) @@ -2830,10 +2841,7 @@ void ExprEngine::processBranch( // two iterations". 
(This pattern is common in FFMPEG and appears in // many other projects as well.) bool CompletedTwoIterations = IterationsCompletedInLoop.value_or(0) >= 2; - bool FalseAlsoFeasible = - StFalse || - didEagerlyAssumeBifurcateAt(PrevState, dyn_cast(Condition)); - bool SkipTrueBranch = CompletedTwoIterations && FalseAlsoFeasible; + bool SkipTrueBranch = BothFeasible && CompletedTwoIterations; // FIXME: This "don't assume third iteration" heuristic partially // conflicts with the widen-loop analysis option (which is off by @@ -2843,8 +2851,25 @@ void ExprEngine::processBranch( Builder.generateNode(StTrue, true, PredN); } - if (StFalse) - Builder.generateNode(StFalse, false, PredN); + if (StFalse) { + // In a loop, if both branches are feasible (i.e. the analyzer doesn't + // understand the loop condition), we are before the first iteration and + // the analyzer option `assume-at-least-one-iteration` is set to `true`, + // then avoid creating the execution path where the loop is skipped. + // + // In some situations this "loop is skipped" execution path is an + // important corner case that may evade the notice of the developer and + // hide significant bugs -- however, there are also many situations where + // it's guaranteed that at least one iteration will happen (e.g. some + // data structure is always nonempty), but the analyzer cannot realize + // this and will produce false positives when it assumes that the loop is + // skipped. + bool BeforeFirstIteration = IterationsCompletedInLoop == std::optional{0}; + bool SkipFalseBranch = BothFeasible && BeforeFirstIteration && + AMgr.options.ShouldAssumeAtLeastOneIteration; + if (!SkipFalseBranch) + Builder.generateNode(StFalse, false, PredN); + } } currBldrCtx = nullptr; } diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index e60ff894c9715..31f066b37858d 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -840,10 +840,17 @@ template struct SS { constexpr SS(unsigned long long N) : data(nullptr){ - data = alloc.allocate(N); // #call + data = alloc.allocate(N); for(std::size_t i = 0; i < N; i ++) - std::construct_at(data + i, i); // #construct_call + std::construct_at(data + i, i); } + + constexpr SS() + : data(nullptr){ + data = alloc.allocate(1); + std::construct_at(data); + } + constexpr T operator[](std::size_t i) const { return data[i]; } @@ -855,6 +862,7 @@ struct SS { T* data; }; constexpr unsigned short ssmall = SS(100)[42]; +constexpr auto Ss = SS()[0]; diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp index 86af8c50f3174..2728dc151c3c5 100644 --- a/clang/test/AST/ast-dump-templates.cpp +++ b/clang/test/AST/ast-dump-templates.cpp @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -ast-dump=json %s | FileCheck --check-prefix=JSON %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -ast-print %s > %t +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++20 -ast-dump=json %s | FileCheck --check-prefix=JSON %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++20 -ast-print %s > %t // RUN: FileCheck < %t %s -check-prefix=CHECK1 // RUN: FileCheck < %t %s -check-prefix=CHECK2 -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++17 -ast-dump %s | FileCheck --check-prefix=DUMP %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++20 -ast-dump %s | FileCheck --check-prefix=DUMP %s // Test with serialization: -// RUN: %clang_cc1 -triple 
x86_64-unknown-unknown -std=c++17 -emit-pch -o %t %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -x c++ -std=c++17 -include-pch %t \ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++20 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -x c++ -std=c++20 -include-pch %t \ // RUN: -ast-dump-all /dev/null \ // RUN: | sed -e "s/ //" -e "s/ imported//" \ // RUN: | FileCheck --strict-whitespace --check-prefix=DUMP %s @@ -135,6 +135,17 @@ namespace test7 { // DUMP: ClassTemplateSpecializationDecl {{.*}} struct A definition explicit_instantiation_definition strict-pack-match{{$}} } // namespce test7 +namespace test8 { +template<_Complex int x> +struct pr126341; +template<> +struct pr126341<{1, 2}>; +// DUMP-LABEL: NamespaceDecl {{.*}} test8{{$}} +// DUMP-NEXT: |-ClassTemplateDecl {{.*}} pr126341 +// DUMP: `-ClassTemplateSpecializationDecl {{.*}} pr126341 +// DUMP: `-TemplateArgument structural value '1+2i' +} // namespace test8 + // NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py @@ -486,6 +497,7 @@ namespace test7 { // JSON-NEXT: "trivial": true // JSON-NEXT: }, // JSON-NEXT: "defaultCtor": { +// JSON-NEXT: "defaultedIsConstexpr": true, // JSON-NEXT: "exists": true, // JSON-NEXT: "nonTrivial": true, // JSON-NEXT: "userProvided": true @@ -819,6 +831,7 @@ namespace test7 { // JSON-NEXT: "trivial": true // JSON-NEXT: }, // JSON-NEXT: "defaultCtor": { +// JSON-NEXT: "defaultedIsConstexpr": true, // JSON-NEXT: "exists": true, // JSON-NEXT: "nonTrivial": true, // JSON-NEXT: "userProvided": true @@ -1408,6 +1421,7 @@ namespace test7 { // JSON-NEXT: "qualType": "void () noexcept" // JSON-NEXT: }, // JSON-NEXT: "inline": true, +// JSON-NEXT: "constexpr": true, // JSON-NEXT: "explicitlyDefaulted": "default" // JSON-NEXT: } // JSON-NEXT: ] @@ -1454,6 +1468,7 @@ namespace test7 { // JSON-NEXT: "trivial": true // JSON-NEXT: }, // JSON-NEXT: "defaultCtor": { +// JSON-NEXT: "defaultedIsConstexpr": true, // JSON-NEXT: "exists": true, // JSON-NEXT: "nonTrivial": true, // JSON-NEXT: "userProvided": true @@ -2067,6 +2082,7 @@ namespace test7 { // JSON-NEXT: "qualType": "void () noexcept" // JSON-NEXT: }, // JSON-NEXT: "inline": true, +// JSON-NEXT: "constexpr": true, // JSON-NEXT: "explicitlyDefaulted": "default" // JSON-NEXT: } // JSON-NEXT: ] @@ -6158,6 +6174,148 @@ namespace test7 { // JSON-NEXT: ] // JSON-NEXT: } // JSON-NEXT: ] +// JSON-NEXT: }, +// JSON-NEXT: { +// JSON-NEXT: "id": "0x{{.*}}", +// JSON-NEXT: "kind": "NamespaceDecl", +// JSON-NEXT: "loc": { +// JSON-NEXT: "offset": 4339, +// JSON-NEXT: "line": 138, +// JSON-NEXT: "col": 11, +// JSON-NEXT: "tokLen": 5 +// JSON-NEXT: }, +// JSON-NEXT: "range": { +// JSON-NEXT: "begin": { +// JSON-NEXT: "offset": 4329, +// JSON-NEXT: "col": 1, +// JSON-NEXT: "tokLen": 9 +// JSON-NEXT: }, +// JSON-NEXT: "end": { +// JSON-NEXT: "offset": 4648, +// JSON-NEXT: "line": 147, +// JSON-NEXT: "col": 1, +// JSON-NEXT: "tokLen": 1 +// JSON-NEXT: } +// JSON-NEXT: }, +// JSON-NEXT: "name": "test8", +// JSON-NEXT: "inner": [ +// JSON-NEXT: { +// JSON-NEXT: "id": "0x{{.*}}", +// JSON-NEXT: "kind": "ClassTemplateDecl", +// JSON-NEXT: "loc": { +// JSON-NEXT: "offset": 4379, +// JSON-NEXT: "line": 140, +// JSON-NEXT: "col": 8, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: }, +// JSON-NEXT: "range": { +// JSON-NEXT: "begin": { +// JSON-NEXT: "offset": 4347, +// JSON-NEXT: "line": 139, +// JSON-NEXT: "col": 1, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: }, +// JSON-NEXT: "end": { +// JSON-NEXT: "offset": 4379, +// JSON-NEXT: 
"line": 140, +// JSON-NEXT: "col": 8, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: } +// JSON-NEXT: }, +// JSON-NEXT: "name": "pr126341", +// JSON-NEXT: "inner": [ +// JSON-NEXT: { +// JSON-NEXT: "id": "0x{{.*}}", +// JSON-NEXT: "kind": "NonTypeTemplateParmDecl", +// JSON-NEXT: "loc": { +// JSON-NEXT: "offset": 4369, +// JSON-NEXT: "line": 139, +// JSON-NEXT: "col": 23, +// JSON-NEXT: "tokLen": 1 +// JSON-NEXT: }, +// JSON-NEXT: "range": { +// JSON-NEXT: "begin": { +// JSON-NEXT: "offset": 4356, +// JSON-NEXT: "col": 10, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: }, +// JSON-NEXT: "end": { +// JSON-NEXT: "offset": 4369, +// JSON-NEXT: "col": 23, +// JSON-NEXT: "tokLen": 1 +// JSON-NEXT: } +// JSON-NEXT: }, +// JSON-NEXT: "name": "x", +// JSON-NEXT: "type": { +// JSON-NEXT: "qualType": "_Complex int" +// JSON-NEXT: }, +// JSON-NEXT: "depth": 0, +// JSON-NEXT: "index": 0 +// JSON-NEXT: }, +// JSON-NEXT: { +// JSON-NEXT: "id": "0x{{.*}}", +// JSON-NEXT: "kind": "CXXRecordDecl", +// JSON-NEXT: "loc": { +// JSON-NEXT: "offset": 4379, +// JSON-NEXT: "line": 140, +// JSON-NEXT: "col": 8, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: }, +// JSON-NEXT: "range": { +// JSON-NEXT: "begin": { +// JSON-NEXT: "offset": 4372, +// JSON-NEXT: "col": 1, +// JSON-NEXT: "tokLen": 6 +// JSON-NEXT: }, +// JSON-NEXT: "end": { +// JSON-NEXT: "offset": 4379, +// JSON-NEXT: "col": 8, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: } +// JSON-NEXT: }, +// JSON-NEXT: "name": "pr126341", +// JSON-NEXT: "tagUsed": "struct" +// JSON-NEXT: }, +// JSON-NEXT: { +// JSON-NEXT: "id": "0x{{.*}}", +// JSON-NEXT: "kind": "ClassTemplateSpecializationDecl", +// JSON-NEXT: "name": "pr126341" +// JSON-NEXT: } +// JSON-NEXT: ] +// JSON-NEXT: }, +// JSON-NEXT: { +// JSON-NEXT: "id": "0x{{.*}}", +// JSON-NEXT: "kind": "ClassTemplateSpecializationDecl", +// JSON-NEXT: "loc": { +// JSON-NEXT: "offset": 4407, +// JSON-NEXT: "line": 142, +// JSON-NEXT: "col": 8, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: }, +// JSON-NEXT: "range": { +// JSON-NEXT: "begin": { +// JSON-NEXT: "offset": 4389, +// JSON-NEXT: "line": 141, +// JSON-NEXT: "col": 1, +// JSON-NEXT: "tokLen": 8 +// JSON-NEXT: }, +// JSON-NEXT: "end": { +// JSON-NEXT: "offset": 4422, +// JSON-NEXT: "line": 142, +// JSON-NEXT: "col": 23, +// JSON-NEXT: "tokLen": 1 +// JSON-NEXT: } +// JSON-NEXT: }, +// JSON-NEXT: "name": "pr126341", +// JSON-NEXT: "tagUsed": "struct", +// JSON-NEXT: "inner": [ +// JSON-NEXT: { +// JSON-NEXT: "kind": "TemplateArgument", +// JSON-NEXT: "value": "1+2i" +// JSON-NEXT: } +// JSON-NEXT: ] +// JSON-NEXT: } +// JSON-NEXT: ] // JSON-NEXT: } // JSON-NEXT: ] // JSON-NEXT: } diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-counted-const-member.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-counted-const-member.cpp index 215238a7fcf07..8da415a818a82 100644 --- a/clang/test/Analysis/Checkers/WebKit/call-args-counted-const-member.cpp +++ b/clang/test/Analysis/Checkers/WebKit/call-args-counted-const-member.cpp @@ -31,6 +31,7 @@ class Foo { public: Foo(); void bar(); + RefCountable& obj1() const { return m_obj1; } private: const Ref m_obj1; @@ -41,6 +42,7 @@ void Foo::bar() { m_obj1->method(); m_obj2->method(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + obj1().method(); } } // namespace call_args_const_ref_member @@ -100,6 +102,7 @@ class Foo { public: Foo(); void bar(); + RefCountable& obj1() { return m_obj1; } private: const UniqueRef m_obj1; @@ -110,6 +113,7 @@ void Foo::bar() { m_obj1->method(); m_obj2->method(); // 
expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + obj1().method(); } } // namespace call_args_const_unique_ref diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h index 85397c2d25951..a1f0cc8b046b9 100644 --- a/clang/test/Analysis/Checkers/WebKit/mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h @@ -289,6 +289,7 @@ class UniqueRef { u.t = nullptr; } T &get() const { return *t; } + operator T&() const { return *t; } T *operator->() const { return t; } UniqueRef &operator=(T &) { return *this; } }; diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm index 9ad1880e9d118..08319016023e3 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm @@ -1,11 +1,14 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s -// expected-no-diagnostics #import "mock-types.h" #import "mock-system-header.h" #import "../../Inputs/system-header-simulator-for-objc-dealloc.h" -@interface Foo : NSObject +@interface Foo : NSObject { + const Ref _obj1; + const RefPtr _obj2; + Ref _obj3; +} @property (nonatomic, readonly) RefPtr countable; @@ -17,6 +20,11 @@ @implementation Foo - (void)execute { self._protectedRefCountable->method(); + _obj1->method(); + _obj1.get().method(); + (*_obj2).method(); + _obj3->method(); + // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } - (RefPtr)_protectedRefCountable { @@ -30,6 +38,7 @@ - (void)execute { void ref() const; void deref() const; Ref copy() const; + void method(); }; @interface WrapperObj : NSObject diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c index d5eb790b82f23..f6a49680917ac 100644 --- a/clang/test/Analysis/analyzer-config.c +++ b/clang/test/Analysis/analyzer-config.c @@ -10,6 +10,7 @@ // CHECK-NEXT: alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling = false // CHECK-NEXT: alpha.osx.cocoa.DirectIvarAssignment:AnnotatedFunctions = false // CHECK-NEXT: apply-fixits = false +// CHECK-NEXT: assume-at-least-one-iteration = false // CHECK-NEXT: assume-controlled-environment = false // CHECK-NEXT: avoid-suppressing-null-argument-paths = false // CHECK-NEXT: c++-allocator-inlining = true diff --git a/clang/test/Analysis/issue-124474.cpp b/clang/test/Analysis/issue-124474.cpp new file mode 100644 index 0000000000000..09e3d4f3f9ad9 --- /dev/null +++ b/clang/test/Analysis/issue-124474.cpp @@ -0,0 +1,49 @@ +// RUN: %clang_analyze_cc1 \ +// RUN: -analyzer-checker=unix.BlockInCriticalSection \ +// RUN: -std=c++11 \ +// RUN: -analyzer-output text \ +// RUN: -verify %s + +// expected-no-diagnostics + +namespace std { + struct mutex { + void lock() {} + void unlock() {} + }; + template + struct lock_guard { + lock_guard(std::mutex) {} + ~lock_guard() {} + }; + template + struct unique_lock { + unique_lock(std::mutex) {} + ~unique_lock() {} + }; + template + struct not_real_lock { + not_real_lock(std::mutex) {} + }; + } // namespace std + +std::mutex mtx; +using ssize_t = long long; +using size_t = unsigned long long; +int open (const char *__file, int __oflag, ...); +ssize_t read(int fd, void *buf, size_t count); +void close(int fd); +#define O_RDONLY 00 +# define O_NONBLOCK 04000 + +void foo() +{ + std::lock_guard lock(mtx); + + const char *filename = "example.txt"; + int fd = open(filename, 
O_RDONLY | O_NONBLOCK); + + char buffer[200] = {}; + read(fd, buffer, 199); // no-warning: fd is a non-block file descriptor + close(fd); +} diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp index 33b8d59305d3d..9cac815e65de1 100644 --- a/clang/test/Analysis/live-stmts.cpp +++ b/clang/test/Analysis/live-stmts.cpp @@ -1,5 +1,5 @@ -// Flaky on aarch64: http://llvm.org/PR126619 -// UNSUPPORTED: target=aarch64{{.*}} +// Disabling this flaky test, see https://github.com/llvm/llvm-project/pull/126913#issuecomment-2655850766 +// UNSUPPORTED: true // RUN: %clang_analyze_cc1 -w -analyzer-checker=debug.DumpLiveExprs %s 2>&1\ // RUN: | FileCheck %s @@ -29,34 +29,36 @@ int testThatDumperWorks(int x, int y, int z) { // CHECK-EMPTY: // CHECK: [ B2 (live expressions at block exit) ] // CHECK-EMPTY: -// CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' -// CHECK-EMPTY: -// CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' -// CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' // CHECK-EMPTY: -// CHECK-EMPTY: -// CHECK: [ B3 (live expressions at block exit) ] -// CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B3 (live expressions at block exit) ] +// CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' -// CHECK: [ B4 (live expressions at block exit) ] // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B4 (live expressions at block exit) ] +// CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' // CHECK-EMPTY: +// CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' +// CHECK-EMPTY: +// CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' +// CHECK-EMPTY: // CHECK-EMPTY: // CHECK: [ B5 (live expressions at block exit) ] // CHECK-EMPTY: @@ -226,15 +228,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -245,15 +247,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr 
{{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -264,15 +266,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -283,15 +285,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 diff --git a/clang/test/Analysis/loop-assumptions.c b/clang/test/Analysis/loop-assumptions.c index eb0ffdce722e0..b61ed8815e3f6 100644 --- a/clang/test/Analysis/loop-assumptions.c +++ b/clang/test/Analysis/loop-assumptions.c @@ -1,25 +1,48 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ -// RUN: -verify=expected,eagerlyassume %s +// RUN: -verify=expected,noassumeone,eagerlyassume,combo %s // RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ // RUN: -analyzer-config eagerly-assume=false \ +// RUN: -verify=expected,noassumeone,noeagerlyassume,combo %s +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config assume-at-least-one-iteration=true \ +// RUN: -verify=expected,eagerlyassume,combo %s +// RUN: %clang_analyze_cc1 -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config assume-at-least-one-iteration=true,eagerly-assume=false \ // RUN: -verify=expected,noeagerlyassume %s +// The verify tag "combo" is used for one unique warning which is produced in three +// of the four RUN combinations. + // These tests validate the logic within `ExprEngine::processBranch` which // ensures that in loops with opaque conditions we don't assume execution paths // if the code does not imply that they are possible. +// In particular, if two (or more) iterations are already completed in a loop, +// we don't assume that there can be another iteration. 
Moreover, if the +// analyzer option `assume-at-least-one-iteration` is enabled, then we don't +// assume that a loop can be skipped completely. void clang_analyzer_numTimesReached(void); -void clang_analyzer_warnIfReached(void); void clang_analyzer_dump(int); -void clearCondition(void) { - // If the analyzer can definitely determine the value of the loop condition, +void clearTrueCondition(void) { + // If the analyzer can definitely determine that the loop condition is true, // then this corrective logic doesn't activate and the engine executes // `-analyzer-max-loop` iterations (by default, 4). - for (int i = 0; i < 10; i++) + int i; + for (i = 0; i < 10; i++) clang_analyzer_numTimesReached(); // expected-warning {{4}} - clang_analyzer_warnIfReached(); // unreachable + clang_analyzer_dump(i); // Unreachable, no reports. +} + +void clearFalseCondition(void) { + // If the analyzer can definitely determine that the loop condition is false, + // then the loop is skipped, even in `assume-at-least-one-iteration` mode. + int i; + for (i = 0; i > 10; i++) + clang_analyzer_numTimesReached(); // Unreachable, no report. + + clang_analyzer_dump(i); // expected-warning {{0}} } void opaqueCondition(int arg) { @@ -28,10 +51,13 @@ void opaqueCondition(int arg) { // that more than two iterations are possible. (It _does_ imply that two // iterations may be possible at least in some cases, because otherwise an // `if` would've been enough.) - for (int i = 0; i < arg; i++) + // Moreover, if `assume-at-least-one-iteration` is enabled, then assume at + // least one iteration. + int i; + for (i = 0; i < arg; i++) clang_analyzer_numTimesReached(); // expected-warning {{2}} - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + clang_analyzer_dump(i); // noassumeone-warning {{0}} expected-warning {{1}} expected-warning {{2}} } int check(void); @@ -42,22 +68,26 @@ void opaqueConditionCall(int arg) { // insert an assertion to guide the analyzer and rule out more than two // iterations (so the analyzer needs to proactively avoid those unjustified // branches). - while (check()) + int i = 0; // Helper to distinguish the the branches after the loop. + while (check()) { clang_analyzer_numTimesReached(); // expected-warning {{2}} + i++; + } - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + clang_analyzer_dump(i); // noassumeone-warning {{0}} expected-warning {{1}} expected-warning {{2}} } void opaqueConditionDoWhile(int arg) { // Same situation as `opaqueCondition()` but with a `do {} while ()` loop. // This is tested separately because this loop type is a special case in the // iteration count calculation. + // Obviously, this loop guarantees that at least one iteration will happen. int i = 0; do { clang_analyzer_numTimesReached(); // expected-warning {{2}} } while (i++ < arg); - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + clang_analyzer_dump(i); // expected-warning {{1}} expected-warning {{2}} } void dontRememberOldBifurcation(int arg) { @@ -69,7 +99,7 @@ void dontRememberOldBifurcation(int arg) { // by default), because the code remembered that there was a bifurcation on // the first iteration of the loop and didn't realize that this is obsolete. - // NOTE: The variable `i` is introduced to ensure that the iterations of the + // NOTE: The variable `i` is significant to ensure that the iterations of the // loop change the state -- otherwise the analyzer stops iterating because it // returns to the same `ExplodedNode`. 
int i = 0; @@ -78,10 +108,12 @@ void dontRememberOldBifurcation(int arg) { i++; } - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + clang_analyzer_dump(i); // noassumeone-warning {{0}} } void dontAssumeFourthIterartion(int arg) { + int i; + if (arg == 2) return; @@ -89,10 +121,10 @@ void dontAssumeFourthIterartion(int arg) { // iterations (because it knows that `arg != 2` at that point), so it // performs a third iteration, but it does not assume that a fourth iteration // is also possible. - for (int i = 0; i < arg; i++) + for (i = 0; i < arg; i++) clang_analyzer_numTimesReached(); // expected-warning {{3}} - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + clang_analyzer_dump(i); // noassumeone-warning {{0}} expected-warning {{1}} expected-warning {{3}} } #define TRUE 1 @@ -108,42 +140,53 @@ void shortCircuitInLoopCondition(int arg) { // false positive on the ffmpeg codebase. Eventually we should properly // recognize the full syntactical loop condition expression as "the loop // condition", but this will be complicated to implement. - for (int i = 0; i < arg && TRUE; i++) { + int i; + for (i = 0; i < arg && TRUE; i++) { clang_analyzer_numTimesReached(); // expected-warning {{4}} } - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + + clang_analyzer_dump(i); // expected-warning {{0}} expected-warning {{1}} expected-warning {{2}} expected-warning {{3}} } void shortCircuitInLoopConditionRHS(int arg) { // Unlike `shortCircuitInLoopCondition()`, this case is handled properly // because the analyzer thinks that the right hand side of the `&&` is the // loop condition. - for (int i = 0; TRUE && i < arg; i++) { + int i; + for (i = 0; TRUE && i < arg; i++) { clang_analyzer_numTimesReached(); // expected-warning {{2}} } - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + + clang_analyzer_dump(i); // noassumeone-warning {{0}} expected-warning {{1}} expected-warning {{2}} } void eagerlyAssumeInSubexpression(int arg) { // The `EagerlyAssume` logic is another complication that can "split the // state" within the loop condition, but before the `processBranch()` call - // which is (in theory) responsible for evaluating the loop condition. - // The current implementation partially compensates this by noticing the + // which would be "naturally" responsible for evaluating the loop condition. + // The current implementation tries to handle this by noticing the // cases where the loop condition is targeted by `EagerlyAssume`, but does // not handle the (fortunately rare) case when `EagerlyAssume` hits a // sub-expression of the loop condition (as in this contrived test case). - // FIXME: I don't know a real-world example for this inconsistency, but it - // would be good to eliminate it eventually. - for (int i = 0; (i >= arg) - 1; i++) { + // FIXME: It would be good to eventually eliminate this inconsistency, but + // I don't know a realistic example that could appear in real-world code, so + // this seems to be a low-priority goal. + int i; + for (i = 0; (i >= arg) - 1; i++) { clang_analyzer_numTimesReached(); // eagerlyassume-warning {{4}} noeagerlyassume-warning {{2}} } - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + + // The 'combo' note intentionally appears if `assume-at-least-one-iteration` + // is disabled, but also appears as a bug when `eagerly-assume` and + // `assume-at-least-one-iteration` are both enabled. 
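+  // (Concretely, the {{0}} value below is produced in three of the four RUN
+  // configurations; it is only absent when `eagerly-assume` is disabled and
+  // `assume-at-least-one-iteration` is enabled.)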
+ clang_analyzer_dump(i); // combo-warning {{0}} expected-warning {{1}} expected-warning {{2}} eagerlyassume-warning {{3}} } void calledTwice(int arg, int isFirstCall) { // This function is called twice (with two different unknown 'arg' values) to // check the iteration count handling in this situation. - for (int i = 0; i < arg; i++) { + int i; + for (i = 0; i < arg; i++) { if (isFirstCall) { clang_analyzer_numTimesReached(); // expected-warning {{2}} } else { @@ -215,5 +258,5 @@ void onlyLoopConditions(int arg) { break; } - clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + clang_analyzer_dump(i); // expected-warning {{1}} expected-warning {{2}} expected-warning {{3}} expected-warning {{4}} } diff --git a/clang/test/Analysis/out-of-bounds-constraint-check.c b/clang/test/Analysis/out-of-bounds-constraint-check.c index df48c8c170713..f20159da02997 100644 --- a/clang/test/Analysis/out-of-bounds-constraint-check.c +++ b/clang/test/Analysis/out-of-bounds-constraint-check.c @@ -1,112 +1,163 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,security.ArrayBound,debug.ExprInspection \ // RUN: -analyzer-config eagerly-assume=false -verify %s -void clang_analyzer_eval(int); -void clang_analyzer_printState(void); - -typedef typeof(sizeof(int)) size_t; -const char a[] = "abcd"; // extent: 5 bytes - -void symbolic_size_t_and_int0(size_t len) { - (void)a[len + 1]; // no-warning - // We infered that the 'len' must be in a specific range to make the previous indexing valid. - // len: [0,3] - clang_analyzer_eval(len <= 3); // expected-warning {{TRUE}} - clang_analyzer_eval(len <= 2); // expected-warning {{UNKNOWN}} -} - -void symbolic_size_t_and_int1(size_t len) { - (void)a[len]; // no-warning - // len: [0,4] - clang_analyzer_eval(len <= 4); // expected-warning {{TRUE}} - clang_analyzer_eval(len <= 3); // expected-warning {{UNKNOWN}} -} - -void symbolic_size_t_and_int2(size_t len) { - (void)a[len - 1]; // no-warning - // len: [1,5] - clang_analyzer_eval(1 <= len && len <= 5); // expected-warning {{TRUE}} - clang_analyzer_eval(2 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 4); // expected-warning {{UNKNOWN}} -} - -void symbolic_uint_and_int0(unsigned len) { - (void)a[len + 1]; // no-warning - // len: [0,3] - clang_analyzer_eval(0 <= len && len <= 3); // expected-warning {{TRUE}} - clang_analyzer_eval(1 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 2); // expected-warning {{UNKNOWN}} -} - -void symbolic_uint_and_int1(unsigned len) { - (void)a[len]; // no-warning - // len: [0,4] - clang_analyzer_eval(0 <= len && len <= 4); // expected-warning {{TRUE}} - clang_analyzer_eval(1 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 3); // expected-warning {{UNKNOWN}} -} -void symbolic_uint_and_int2(unsigned len) { - (void)a[len - 1]; // no-warning - // len: [1,5] - clang_analyzer_eval(1 <= len && len <= 5); // expected-warning {{TRUE}} - clang_analyzer_eval(2 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 4); // expected-warning {{UNKNOWN}} -} - -void symbolic_int_and_int0(int len) { - (void)a[len + 1]; // no-warning - // len: [-1,3] - clang_analyzer_eval(-1 <= len && len <= 3); // expected-warning {{TRUE}} - clang_analyzer_eval(0 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 2); // expected-warning {{UNKNOWN}} -} -void symbolic_int_and_int1(int len) { - (void)a[len]; // no-warning - // len: [0,4] - clang_analyzer_eval(0 <= len && len <= 4); // expected-warning {{TRUE}} 
- clang_analyzer_eval(1 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 3); // expected-warning {{UNKNOWN}} -} -void symbolic_int_and_int2(int len) { - (void)a[len - 1]; // no-warning - // len: [1,5] - clang_analyzer_eval(1 <= len && len <= 5); // expected-warning {{TRUE}} - clang_analyzer_eval(2 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 4); // expected-warning {{UNKNOWN}} -} - -void symbolic_longlong_and_int0(long long len) { - (void)a[len + 1]; // no-warning - // len: [-1,3] - clang_analyzer_eval(-1 <= len && len <= 3); // expected-warning {{TRUE}} - clang_analyzer_eval(0 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 2); // expected-warning {{UNKNOWN}} +// When the checker security.ArrayBound encounters an array subscript operation +// that _may be_ in bounds, it assumes that indexing _is_ in bound. This test +// file validates these assumptions. + +void clang_analyzer_value(int); + +// Simple case: memory area with a static extent. + +extern int FiveInts[5]; + +void int_plus_one(int len) { + (void)FiveInts[len + 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [-1, 3] }}} +} + +void int_neutral(int len) { + (void)FiveInts[len]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} +} + +void int_minus_one(int len) { + (void)FiveInts[len - 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} +} + +void unsigned_plus_one(unsigned len) { + (void)FiveInts[len + 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 3] }}} +} + +void unsigned_neutral(unsigned len) { + (void)FiveInts[len]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} +} + +void unsigned_minus_one(unsigned len) { + (void)FiveInts[len - 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} +} + +void ll_plus_one(long long len) { + (void)FiveInts[len + 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [-1, 3] }}} +} + +void ll_neutral(long long len) { + (void)FiveInts[len]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} +} + +void ll_minus_one(long long len) { + (void)FiveInts[len - 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} +} + +void ull_plus_one(unsigned long long len) { + (void)FiveInts[len + 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 3] }}} +} + +void ull_neutral(unsigned long long len) { + (void)FiveInts[len]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} +} + +void ull_minus_one(unsigned long long len) { + (void)FiveInts[len - 1]; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} } +// Also try the same with a dynamically allocated memory block, because in the +// past there were issues with the type/signedness of dynamic extent symbols. 
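The ranges in the expected-warning lines follow from simple interval arithmetic: indexing an object of Extent elements at (len + Offset) is in bounds only when 0 <= len + Offset <= Extent - 1, so after a warning-free access the analyzer may assume len lies in [-Offset, Extent - 1 - Offset], with the lower bound clamped to 0 for unsigned types. A small standalone sketch of that computation (the helper name is made up, not analyzer code):

#include <algorithm>
#include <cstdio>
#include <utility>

// Feasible [lo, hi] range for 'len' after an in-bounds access to an object of
// 'Extent' elements at index (len + Offset).
std::pair<long, long> assumedRange(long Extent, long Offset, bool IsUnsigned) {
  long Lo = -Offset;
  long Hi = Extent - 1 - Offset;
  if (IsUnsigned)
    Lo = std::max(Lo, 0L);
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = assumedRange(5, +1, /*IsUnsigned=*/false);
  std::printf("[%ld, %ld]\n", Lo, Hi); // prints "[-1, 3]", matching int_plus_one
}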
+ +typedef __typeof(sizeof(int)) size_t; void *malloc(size_t); void free(void *); -void symbolic_longlong_and_int0_dynamic_extent(long long len) { - char *b = malloc(5); - (void)b[len + 1]; // no-warning - // len: [-1,3] - clang_analyzer_eval(-1 <= len && len <= 3); // expected-warning {{TRUE}} - clang_analyzer_eval(0 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 2); // expected-warning {{UNKNOWN}} - free(b); -} - -void symbolic_longlong_and_int1(long long len) { - (void)a[len]; // no-warning - // len: [0,4] - clang_analyzer_eval(0 <= len && len <= 4); // expected-warning {{TRUE}} - clang_analyzer_eval(1 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 3); // expected-warning {{UNKNOWN}} -} - -void symbolic_longlong_and_int2(long long len) { - (void)a[len - 1]; // no-warning - // len: [1,5] - clang_analyzer_eval(1 <= len && len <= 5); // expected-warning {{TRUE}} - clang_analyzer_eval(2 <= len); // expected-warning {{UNKNOWN}} - clang_analyzer_eval(len <= 4); // expected-warning {{UNKNOWN}} + +void dyn_int_plus_one(int len) { + char *p = malloc(5); + p[len + 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [-1, 3] }}} + free(p); +} + +void dyn_int_neutral(int len) { + char *p = malloc(5); + p[len] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} + free(p); +} + +void dyn_int_minus_one(int len) { + char *p = malloc(5); + p[len - 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} + free(p); +} + +void dyn_unsigned_plus_one(unsigned len) { + char *p = malloc(5); + p[len + 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 3] }}} + free(p); +} + +void dyn_unsigned_neutral(unsigned len) { + char *p = malloc(5); + p[len] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} + free(p); +} + +void dyn_unsigned_minus_one(unsigned len) { + char *p = malloc(5); + p[len - 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} + free(p); +} + +void dyn_ll_plus_one(long long len) { + char *p = malloc(5); + p[len + 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [-1, 3] }}} + free(p); +} + +void dyn_ll_neutral(long long len) { + char *p = malloc(5); + p[len] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} + free(p); +} + +void dyn_ll_minus_one(long long len) { + char *p = malloc(5); + p[len - 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} + free(p); +} + +void dyn_ull_plus_one(unsigned long long len) { + char *p = malloc(5); + p[len + 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 3] }}} + free(p); +} + +void dyn_ull_neutral(unsigned long long len) { + char *p = malloc(5); + p[len] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [0, 4] }}} + free(p); +} + +void dyn_ull_minus_one(unsigned long long len) { + char *p = malloc(5); + p[len - 1] = 1; // no-warning + clang_analyzer_value(len); // expected-warning {{{ [1, 5] }}} + free(p); } diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c index 923797200d0b4..9f410e884d763 100644 --- a/clang/test/Analysis/out-of-bounds.c +++ b/clang/test/Analysis/out-of-bounds.c @@ -1,6 +1,4 @@ -// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,security.ArrayBound,debug.ExprInspection -verify %s - -void clang_analyzer_eval(int); +// RUN: 
%clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,security.ArrayBound -verify %s // Tests doing an out-of-bounds access after the end of an array using: // - constant integer index @@ -142,12 +140,6 @@ void test4(int x) { buf[x] = 1; // expected-warning{{Out of bound access to memory}} } -void test_assume_after_access(unsigned long x) { - int buf[100]; - buf[x] = 1; - clang_analyzer_eval(x <= 99); // expected-warning{{TRUE}} -} - // Don't warn when indexing below the start of a symbolic region's whose // base extent we don't know. int *get_symbolic(void); @@ -180,12 +172,6 @@ void test_extern_void(void) { p[1] = 42; // no-warning } -void test_assume_after_access2(unsigned long x) { - char buf[100]; - buf[x] = 1; - clang_analyzer_eval(x <= 99); // expected-warning{{TRUE}} -} - struct incomplete; char test_comparison_with_extent_symbol(struct incomplete *p) { // Previously this was reported as a (false positive) overflow error because diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index e9eb54a67204c..b796a51ef600e 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -103,6 +103,15 @@ if(CLANG_BUILD_EXAMPLES AND CLANG_PLUGIN_SUPPORT) ) endif () +if(LLVM_INCLUDE_SPIRV_TOOLS_TESTS) + list(APPEND CLANG_TEST_DEPS + spirv-dis + spirv-val + spirv-as + spirv-link + ) +endif() + set(CLANG_TEST_PARAMS USE_Z3_SOLVER=0 ) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index 0cd81a77f5cc5..c116604288546 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -86,7 +86,7 @@ int div(int x, int y) { } // CHECK-LABEL: define dso_local i32 @null( -// CHECK-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // @@ -102,7 +102,7 @@ int div(int x, int y) { // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( -// TR-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // TR-NEXT: [[ENTRY:.*:]] // TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] @@ -116,7 +116,7 @@ int div(int x, int y) { // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( -// REC-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// REC-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] // REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu index 31ca9216b413e..72f7857264f8c 100644 --- a/clang/test/CodeGenCUDA/launch-bounds.cu +++ b/clang/test/CodeGenCUDA/launch-bounds.cu @@ -9,6 +9,25 @@ #define MAX_BLOCKS_PER_MP 4 #endif +// CHECK: @Kernel1() #[[ATTR0:[0-9]+]] +// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]] +// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]] +// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]] +// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]] + +// CHECK: attributes #[[ATTR0]] = {{{.*}} 
"nvvm.minctasm"="2" {{.*}}} +// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}} +// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}} + +// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]] + +// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}} +// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}} +// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}} + // Test both max threads per block and Min cta per sm. extern "C" { __global__ void @@ -19,7 +38,6 @@ Kernel1() } // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256} -// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"minctasm", i32 2} #ifdef USE_MAX_BLOCKS // Test max threads per block and min/max cta per sm. @@ -32,8 +50,6 @@ Kernel1_sm_90() } // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"minctasm", i32 2} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxclusterrank", i32 4} #endif // USE_MAX_BLOCKS // Test only max threads per block. Min cta per sm defaults to 0, and @@ -67,7 +83,6 @@ Kernel4() template __global__ void Kernel4(); // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256} -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2} #ifdef USE_MAX_BLOCKS template @@ -79,8 +94,6 @@ Kernel4_sm_90() template __global__ void Kernel4_sm_90(); // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"minctasm", i32 2} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxclusterrank", i32 4} #endif //USE_MAX_BLOCKS const int constint = 100; @@ -94,7 +107,6 @@ Kernel5() template __global__ void Kernel5(); // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356} -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258} #ifdef USE_MAX_BLOCKS @@ -109,8 +121,6 @@ Kernel5_sm_90() template __global__ void Kernel5_sm_90(); // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"minctasm", i32 258} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxclusterrank", i32 260} #endif //USE_MAX_BLOCKS // Make sure we don't emit negative launch bounds values. 
@@ -120,7 +130,6 @@ Kernel6() { } // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx", -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"minctasm", __global__ void __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP ) @@ -144,12 +153,9 @@ Kernel7_sm_90() const char constchar = 12; __global__ void __launch_bounds__(constint, constchar) Kernel8() {} // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100 -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12 #ifdef USE_MAX_BLOCKS const char constchar_2 = 14; __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {} // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100 -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"minctasm", i32 12 -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxclusterrank", i32 14 #endif // USE_MAX_BLOCKS diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp index 83daf57be22ff..3662a270713b6 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s -// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } @@ -22,12 +22,12 @@ // CHECK: declare ptr @__dynamic_cast(ptr, ptr, ptr, i64) local_unnamed_addr -// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr // CHECK-NEXT: entry // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } -// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly captures(address_is_null, ret: address, provenance) %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %b, null // CHECK-NEXT: br i1 [[isnull]], label %[[dynamic_cast_end:[a-z0-9._]+]], label %[[dynamic_cast_notnull:[a-z0-9._]+]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index c471e5dbd7b33..2a838708ca231 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -24,7 +24,7 @@ // CHECK-NEXT: ret ptr @_ZTS1A // CHECK-NEXT: } -// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly %a) local_unnamed_addr +// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly captures(address_is_null) %a) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %a, null // CHECK-NEXT: br i1 [[isnull]], label %[[bad_typeid:[a-z0-9._]+]], label %[[end:[a-z0-9.+]+]] diff --git a/clang/test/CodeGenHLSL/disable_opt.hlsl b/clang/test/CodeGenHLSL/disable_opt.hlsl deleted file mode 100644 index bfffe76cfa9de..0000000000000 --- a/clang/test/CodeGenHLSL/disable_opt.hlsl +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -O0 -emit-llvm -xhlsl -o - %s | FileCheck %s -// RUN: 
%clang_cc1 -triple dxil-pc-shadermodel6.3-library -O3 -emit-llvm -xhlsl -o - %s | FileCheck %s --check-prefix=OPT - -// CHECK:!"dx.disable_optimizations", i32 1} - -// OPT-NOT:"dx.disable_optimizations" - -float bar(float a, float b); - -float foo(float a, float b) { - return bar(a, b); -} diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl index e78d04ec9594f..4748eeee7475f 100644 --- a/clang/test/CodeGenHLSL/inline-functions.hlsl +++ b/clang/test/CodeGenHLSL/inline-functions.hlsl @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE,OPT_ATTR +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,OPT_ATTR +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,NOOPT_ATTR // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECK,NOINLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O0 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,OPT_ATTR +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute %s -emit-llvm -O1 -o - | FileCheck %s --check-prefixes=CHECK,INLINE,NOOPT_ATTR // Tests that user functions will always be inlined. // This includes exported functions and mangled entry point implementation functions. @@ -71,7 +71,8 @@ RWBuffer Indices; // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that -// CHECK: Function Attrs: {{.*}}noinline +// OPT_ATTR: Function Attrs: {{.*}}optnone +// NOOPT_ATTR-NOT: Function Attrs: {{.*}}optnone // CHECK: define void @main() {{[a-z_ ]*}}[[EntryAttr:\#[0-9]+]] // Make sure function calls are inlined when AlwaysInline is run // This only leaves calls to llvm. intrinsics @@ -98,7 +99,8 @@ void main(unsigned int GI : SV_GroupIndex) { // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that -// CHECK: Function Attrs: {{.*}}noinline +// OPT_ATTR: Function Attrs: {{.*}}optnone +// NOOPT_ATTR-NOT: Function Attrs: {{.*}}optnone // CHECK: define void @main10() {{[a-z_ ]*}}[[EntryAttr]] // Make sure function calls are inlined when AlwaysInline is run // This only leaves calls to llvm. 
intrinsics diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl index 0aadaad2dca5c..62fd20c4d1414 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl @@ -22,7 +22,7 @@ __amdgpu_buffer_rsrc_t getBuffer(void *p) { } // CHECK-LABEL: define {{[^@]+}}@consumeBufferPtr -// CHECK-SAME: (ptr addrspace(5) noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] @@ -39,7 +39,7 @@ void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { } // CHECK-LABEL: define {{[^@]+}}@test -// CHECK-SAME: (ptr addrspace(5) noundef readonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 diff --git a/clang/test/CodeGenOpenCL/as_type.cl b/clang/test/CodeGenOpenCL/as_type.cl index 1fe26fbeafdb4..2c6cdc3810b4d 100644 --- a/clang/test/CodeGenOpenCL/as_type.cl +++ b/clang/test/CodeGenOpenCL/as_type.cl @@ -67,7 +67,7 @@ int3 f8(char16 x) { return __builtin_astype(x, int3); } -//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone %[[x:.*]]) +//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone captures(ret: address, provenance) %[[x:.*]]) //CHECK: %[[cast:.*]] ={{.*}} addrspacecast ptr %[[x]] to ptr addrspace(1) //CHECK: ret ptr addrspace(1) %[[cast]] global int* addr_cast(int *x) { diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c index c28a758bfc0c5..f6a8a7dc57ccc 100644 --- a/clang/test/Driver/amdgpu-openmp-sanitize-options.c +++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c @@ -1,11 +1,11 @@ // REQUIRES: x86-registered-target, amdgpu-registered-target // Fail on invalid ROCm Path. -// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fgpu-sanitize -nogpuinc --rocm-path=%S/Inputs/rocm-invalid %s 2>&1 \ +// RUN: not %clang -no-canonical-prefixes -### -mcode-object-version=5 --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fgpu-sanitize -nogpuinc --rocm-path=%S/Inputs/rocm-invalid %s 2>&1 \ // RUN: | FileCheck --check-prefix=FAIL %s // Enable multiple sanitizer's apart from ASan with invalid rocm-path. 
-// RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fsanitize=leak -fgpu-sanitize --rocm-path=%S/Inputs/rocm-invalid -nogpuinc %s 2>&1 \ +// RUN: not %clang -no-canonical-prefixes -### -mcode-object-version=5 --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fsanitize=leak -fgpu-sanitize --rocm-path=%S/Inputs/rocm-invalid -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefixes=NOTSUPPORTED,FAIL %s // Memory, Leak, UndefinedBehaviour and Thread Sanitizer are not supported on AMDGPU. @@ -13,38 +13,40 @@ // RUN: | FileCheck --check-prefix=NOTSUPPORTED %s // GPU ASan Enabled Test Cases -// ASan enabled for amdgpu-arch [gfx908] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOXNACK,GPUSAN %s - -// GPU ASan enabled for amdgpu-arch [gfx908:xnack-] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=XNACKNEG,GPUSAN %s // GPU ASan enabled for amdgpu-arch [gfx908:xnack+] // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=GPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s + +// GPU ASan enabled through '-fsanitize=address' flag without '-fgpu-sanitize' for amdgpu-arch [gfx908:xnack+] +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s // ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+] // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=GPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s // GPU ASan Disabled Test Cases -// ASan disabled for amdgpu-arch [gfx908] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s + +// GPU ASan disabled through '-fsanitize=address' without '-fgpu-sanitize' flag for amdgpu-arch [gfx908] +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=NOXNACK,HOSTSAN,NOGPUSAN,SAN %s + +// GPU ASan disabled for amdgpu-arch [gfx908] +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=NOXNACK,HOSTSAN,NOGPUSAN,SAN %s // GPU ASan disabled for amdgpu-arch [gfx908:xnack-] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp 
--offload-arch=gfx908:xnack- -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=XNACKNEG,HOSTSAN,NOGPUSAN,SAN %s -// GPU ASan disabled for amdgpu-arch [gfx908:xnack+] +// GPU ASan disabled using '-fno-gpu-sanitize' for amdgpu-arch [gfx908:xnack+] // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,NOGPUSAN,SAN %s -// ASan disabled for amdgpu-arch [gfx908:xnack+,gfx900:xnack+] +// GPU ASan disabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+] // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,NOGPUSAN,SAN %s // FAIL-DAG: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library // NOTSUPPORTED-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' @@ -52,14 +54,11 @@ // NOXNACK: warning: ignoring '-fsanitize=address' option for offload arch 'gfx908' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead // XNACKNEG: warning: ignoring '-fsanitize=address' option for offload arch 'gfx908:xnack-' as it is not currently supported there. 
Use it with an offload arch containing 'xnack+' instead -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}} -// GPUSAN: {{"[^"]*clang-offload-packager[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} -// GPUSAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} +// HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} -// NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} +// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}} // NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}} -// NOGPUSAN: {{"[^"]*clang-offload-packager[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} -// NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} -// NOGPUSAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} + +// SAN: {{"[^"]*clang-offload-packager[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} +// SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} +// SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} diff --git a/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip index 79a52f0bc8981..f17e56acfb7f7 100644 --- a/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip +++ b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip @@ -1,6 +1,5 @@ -// RUN: %clang -### -nogpuinc -nogpulib --offload-arch=gfx1030 --offload-arch=gfx1100 
--offload-arch=gfx1101 -MD -MF tmp.d %s 2>&1 | FileCheck %s +// RUN: %clang -### -nogpuinc -nogpulib --offload-arch=gfx1030 --offload-arch=gfx1100 --offload-arch=gfx1101 --target=x86_64-linux-gnu -MD -MF tmp.d %s 2>&1 | FileCheck %s -// CHECK: Build config: // CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1030"{{.*}}"-dependency-file" "tmp.d" // CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1030" // CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1100"{{.*}}"-dependency-file" "tmp.d" diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip index 8a852867f5b3b..8de0ee9e18426 100644 --- a/clang/test/Driver/hip-sanitize-options.hip +++ b/clang/test/Driver/hip-sanitize-options.hip @@ -1,5 +1,5 @@ // RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ -// RUN: -fsanitize=address -fgpu-sanitize \ +// RUN: -fsanitize=address \ // RUN: -nogpuinc --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefixes=NORDC %s diff --git a/clang/test/Driver/sparc-ias-Wa.s b/clang/test/Driver/sparc-ias-Wa.s new file mode 100644 index 0000000000000..79456c02935be --- /dev/null +++ b/clang/test/Driver/sparc-ias-Wa.s @@ -0,0 +1,60 @@ +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av8 2>&1 | \ +// RUN: FileCheck -check-prefix=V8 %s +// V8: -cc1as +// V8: "-target-feature" "-v8plus" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av8plus 2>&1 | \ +// RUN: FileCheck -check-prefix=V8PLUS %s +// V8PLUS: -cc1as +// V8PLUS: "-target-feature" "+v8plus" +// V8PLUS: "-target-feature" "+v9" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av8plusa 2>&1 | \ +// RUN: FileCheck -check-prefix=V8PLUSA %s +// V8PLUSA: -cc1as +// V8PLUSA: "-target-feature" "+v8plus" +// V8PLUSA: "-target-feature" "+v9" +// V8PLUSA: "-target-feature" "+vis" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av8plusb 2>&1 | \ +// RUN: FileCheck -check-prefix=V8PLUSB %s +// V8PLUSB: -cc1as +// V8PLUSB: "-target-feature" "+v8plus" +// V8PLUSB: "-target-feature" "+v9" +// V8PLUSB: "-target-feature" "+vis" +// V8PLUSB: "-target-feature" "+vis2" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av8plusd 2>&1 | \ +// RUN: FileCheck -check-prefix=V8PLUSD %s +// V8PLUSD: -cc1as +// V8PLUSD: "-target-feature" "+v8plus" +// V8PLUSD: "-target-feature" "+v9" +// V8PLUSD: "-target-feature" "+vis" +// V8PLUSD: "-target-feature" "+vis2" +// V8PLUSD: "-target-feature" "+vis3" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av9 2>&1 | \ +// RUN: FileCheck -check-prefix=V9 %s +// V9: -cc1as +// V9: "-target-feature" "+v9" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av9a 2>&1 | \ +// RUN: FileCheck -check-prefix=V9A %s +// V9A: -cc1as +// V9A: "-target-feature" "+v9" +// V9A: "-target-feature" "+vis" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av9b 2>&1 | \ +// RUN: FileCheck -check-prefix=V9B %s +// V9B: -cc1as +// V9B: "-target-feature" "+v9" +// V9B: "-target-feature" "+vis" +// V9B: "-target-feature" "+vis2" + +// RUN: %clang --target=sparc-linux-gnu -### -fintegrated-as -c %s -Wa,-Av9d 2>&1 | \ +// RUN: FileCheck -check-prefix=V9D %s +// V9D: -cc1as +// V9D: "-target-feature" "+v9" +// V9D: "-target-feature" "+vis" +// V9D: "-target-feature" "+vis2" +// V9D: "-target-feature" "+vis3" diff --git a/clang/test/OpenMP/ordered_codegen.cpp 
b/clang/test/OpenMP/ordered_codegen.cpp index 67285cfaef34d..5cd95f1927e5c 100644 --- a/clang/test/OpenMP/ordered_codegen.cpp +++ b/clang/test/OpenMP/ordered_codegen.cpp @@ -572,30 +572,30 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-NEXT: [[ADD6:%.*]] = add i32 [[TMP9]], 1 // CHECK1-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP8]], [[ADD6]] // CHECK1-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP11]], 1 // CHECK1-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4 // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK1-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP13]], 1 -// CHECK1-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK1-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK1: omp.inner.for.end: // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -645,31 +645,31 @@ void foo_simd(int low, int up) { // CHECK1-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND29:%.*]] // CHECK1: omp.inner.for.cond29: -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// 
CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[ADD30:%.*]] = add i32 [[TMP29]], 1 // CHECK1-NEXT: [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]] // CHECK1-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]] // CHECK1: omp.inner.for.body32: -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-NEXT: [[MUL33:%.*]] = mul i32 [[TMP31]], 1 // CHECK1-NEXT: [[ADD34:%.*]] = add i32 [[TMP30]], [[MUL33]] -// CHECK1-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4 // CHECK1-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP32]] to i64 // CHECK1-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM35]] -// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4 +// CHECK1-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE37:%.*]] // CHECK1: omp.body.continue37: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC38:%.*]] // CHECK1: omp.inner.for.inc38: -// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-NEXT: [[ADD39:%.*]] = add i32 [[TMP33]], 1 -// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group [[ACC_GRP7]] -// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4 +// CHECK1-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK1: omp.inner.for.end40: // CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1: omp.dispatch.inc: @@ -1201,32 +1201,32 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1-IRBUILDER: omp.inner.for.cond: -// CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD6:%.*]] = add i32 [[TMP8]], 1 // CHECK1-IRBUILDER-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]] // CHECK1-IRBUILDER-NEXT: br i1 [[CMP7]], label 
[[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body: -// CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 1 // CHECK1-IRBUILDER-NEXT: [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]] -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4 // CHECK1-IRBUILDER-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64 // CHECK1-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body.ordered.after: // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1-IRBUILDER: omp.body.continue: // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1-IRBUILDER: omp.inner.for.inc: -// CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK1-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD9:%.*]] = add i32 [[TMP12]], 1 -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK1-IRBUILDER: omp.inner.for.end: // CHECK1-IRBUILDER-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK1-IRBUILDER-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -1278,34 +1278,34 @@ void foo_simd(int low, int up) { // CHECK1-IRBUILDER-NEXT: store i32 [[TMP26]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] // CHECK1-IRBUILDER: omp.inner.for.cond30: -// CHECK1-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK1-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD31:%.*]] = add i32 [[TMP28]], 1 // CHECK1-IRBUILDER-NEXT: [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]] // CHECK1-IRBUILDER-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label 
[[OMP_INNER_FOR_END42:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body33: -// CHECK1-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[MUL34:%.*]] = mul i32 [[TMP30]], 1 // CHECK1-IRBUILDER-NEXT: [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]] -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4 +// CHECK1-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4 // CHECK1-IRBUILDER-NEXT: [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64 // CHECK1-IRBUILDER-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM36]] -// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4 +// CHECK1-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK1-IRBUILDER: omp.inner.for.body33.ordered.after: // CHECK1-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] // CHECK1-IRBUILDER: omp.body.continue38: // CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] // CHECK1-IRBUILDER: omp.inner.for.inc39: -// CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 -// CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK1-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK1-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) -// CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group [[ACC_GRP7]] -// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK1-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) +// CHECK1-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK1-IRBUILDER: omp.inner.for.end42: // CHECK1-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK1-IRBUILDER: omp.dispatch.inc: @@ -1812,30 +1812,30 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3: omp.inner.for.cond: -// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr 
[[DOTCAPTURE_EXPR_2]], align 4 // CHECK3-NEXT: [[ADD6:%.*]] = add i32 [[TMP9]], 1 // CHECK3-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP8]], [[ADD6]] // CHECK3-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3: omp.inner.for.body: -// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[MUL:%.*]] = mul i32 [[TMP11]], 1 // CHECK3-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], [[MUL]] -// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4 // CHECK3-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK3-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3: omp.body.continue: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: -// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-NEXT: [[ADD9:%.*]] = add i32 [[TMP13]], 1 -// CHECK3-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK3-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3: omp.inner.for.end: // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -1885,31 +1885,31 @@ void foo_simd(int low, int up) { // CHECK3-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-NEXT: br label [[OMP_INNER_FOR_COND29:%.*]] // CHECK3: omp.inner.for.cond29: -// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: [[ADD30:%.*]] = add i32 [[TMP29]], 1 // CHECK3-NEXT: [[CMP31:%.*]] = icmp ult i32 [[TMP28]], [[ADD30]] // CHECK3-NEXT: br i1 [[CMP31]], label [[OMP_INNER_FOR_BODY32:%.*]], label [[OMP_INNER_FOR_END40:%.*]] // CHECK3: omp.inner.for.body32: -// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK3-NEXT: 
[[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-NEXT: [[MUL33:%.*]] = mul i32 [[TMP31]], 1 // CHECK3-NEXT: [[ADD34:%.*]] = add i32 [[TMP30]], [[MUL33]] -// CHECK3-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: store i32 [[ADD34]], ptr [[I28]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = load i32, ptr [[I28]], align 4 // CHECK3-NEXT: [[IDXPROM35:%.*]] = sext i32 [[TMP32]] to i64 // CHECK3-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM35]] -// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX36]], align 4 +// CHECK3-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE37:%.*]] // CHECK3: omp.body.continue37: // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC38:%.*]] // CHECK3: omp.inner.for.inc38: -// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-NEXT: [[ADD39:%.*]] = add i32 [[TMP33]], 1 -// CHECK3-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]), !llvm.access.group [[ACC_GRP7]] -// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK3-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_IV16]], align 4 +// CHECK3-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND29]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK3: omp.inner.for.end40: // CHECK3-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3: omp.dispatch.inc: @@ -2441,32 +2441,32 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK3-IRBUILDER: omp.inner.for.cond: -// CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]] -// CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD6:%.*]] = add i32 [[TMP8]], 1 // CHECK3-IRBUILDER-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]] // CHECK3-IRBUILDER-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body: -// CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 1 // CHECK3-IRBUILDER-NEXT: [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]] -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] 
-// CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4 // CHECK3-IRBUILDER-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64 // CHECK3-IRBUILDER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]), !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt(ptr [[I5]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body.ordered.after: // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK3-IRBUILDER: omp.body.continue: // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3-IRBUILDER: omp.inner.for.inc: -// CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] +// CHECK3-IRBUILDER-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD9:%.*]] = add i32 [[TMP12]], 1 -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]] -// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] // CHECK3-IRBUILDER: omp.inner.for.end: // CHECK3-IRBUILDER-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK3-IRBUILDER-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -2518,34 +2518,34 @@ void foo_simd(int low, int up) { // CHECK3-IRBUILDER-NEXT: store i32 [[TMP26]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30:%.*]] // CHECK3-IRBUILDER: omp.inner.for.cond30: -// CHECK3-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]] -// CHECK3-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD31:%.*]] = add i32 [[TMP28]], 1 // CHECK3-IRBUILDER-NEXT: [[CMP32:%.*]] = icmp ult i32 [[TMP27]], [[ADD31]] // CHECK3-IRBUILDER-NEXT: br i1 [[CMP32]], label [[OMP_INNER_FOR_BODY33:%.*]], label [[OMP_INNER_FOR_END42:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33: -// CHECK3-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_18]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[MUL34:%.*]] = mul i32 [[TMP30]], 1 // CHECK3-IRBUILDER-NEXT: [[ADD35:%.*]] = add i32 [[TMP29]], [[MUL34]] -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: 
[[TMP31:%.*]] = load i32, ptr [[I28]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD35]], ptr [[I28]], align 4 +// CHECK3-IRBUILDER-NEXT: [[TMP31:%.*]] = load i32, ptr [[I28]], align 4 // CHECK3-IRBUILDER-NEXT: [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64 // CHECK3-IRBUILDER-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM36]] -// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4, !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]), !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX37]], align 4 +// CHECK3-IRBUILDER-NEXT: call void @__captured_stmt.1(ptr [[I28]]) // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_BODY33_ORDERED_AFTER:%.*]] // CHECK3-IRBUILDER: omp.inner.for.body33.ordered.after: // CHECK3-IRBUILDER-NEXT: br label [[OMP_BODY_CONTINUE38:%.*]] // CHECK3-IRBUILDER: omp.body.continue38: // CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_INC39:%.*]] // CHECK3-IRBUILDER: omp.inner.for.inc39: -// CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[ADD40:%.*]] = add i32 [[TMP32]], 1 -// CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4, !llvm.access.group [[ACC_GRP7]] +// CHECK3-IRBUILDER-NEXT: store i32 [[ADD40]], ptr [[DOTOMP_IV16]], align 4 // CHECK3-IRBUILDER-NEXT: [[OMP_GLOBAL_THREAD_NUM41:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB12]]) -// CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]), !llvm.access.group [[ACC_GRP7]] -// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK3-IRBUILDER-NEXT: call void @__kmpc_dispatch_fini_4u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM41]]) +// CHECK3-IRBUILDER-NEXT: br label [[OMP_INNER_FOR_COND30]], !llvm.loop [[LOOP5:![0-9]+]] // CHECK3-IRBUILDER: omp.inner.for.end42: // CHECK3-IRBUILDER-NEXT: br label [[OMP_DISPATCH_INC:%.*]] // CHECK3-IRBUILDER: omp.dispatch.inc: @@ -2885,33 +2885,33 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: store i32 0, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK5: omp.inner.for.cond: -// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9:![0-9]+]] -// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK5-NEXT: [[ADD6:%.*]] = add i32 [[TMP8]], 1 // CHECK5-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TMP7]], [[ADD6]] // CHECK5-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK5: omp.inner.for.body: -// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: [[MUL:%.*]] = mul i32 [[TMP10]], 1 // CHECK5-NEXT: [[ADD8:%.*]] = add i32 [[TMP9]], [[MUL]] -// CHECK5-NEXT: store 
i32 [[ADD8]], ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: store i32 [[ADD8]], ptr [[I5]], align 4 +// CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[I5]], align 4 // CHECK5-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64 // CHECK5-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM]] -// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX]], align 4 +// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[I5]], align 4 // CHECK5-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP12]] to i64 // CHECK5-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM9]] -// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX10]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX10]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK5: omp.body.continue: // CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK5: omp.inner.for.inc: -// CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9]] +// CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK5-NEXT: [[ADD11:%.*]] = add i32 [[TMP13]], 1 -// CHECK5-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP9]] -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK5-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] // CHECK5: omp.inner.for.end: // CHECK5-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 @@ -2951,33 +2951,33 @@ void foo_simd(int low, int up) { // CHECK5-NEXT: store i32 [[TMP25]], ptr [[DOTOMP_IV30]], align 4 // CHECK5-NEXT: br label [[OMP_INNER_FOR_COND32:%.*]] // CHECK5: omp.inner.for.cond32: -// CHECK5-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13:![0-9]+]] -// CHECK5-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4 +// CHECK5-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK5-NEXT: [[ADD33:%.*]] = add i32 [[TMP27]], 1 // CHECK5-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP26]], [[ADD33]] // CHECK5-NEXT: br i1 [[CMP34]], label [[OMP_INNER_FOR_BODY35:%.*]], label [[OMP_INNER_FOR_END45:%.*]] // CHECK5: omp.inner.for.body35: -// CHECK5-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4 +// CHECK5-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4 // CHECK5-NEXT: [[MUL36:%.*]] = mul i32 [[TMP29]], 1 // CHECK5-NEXT: [[ADD37:%.*]] = add i32 [[TMP28]], [[MUL36]] -// CHECK5-NEXT: store i32 [[ADD37]], ptr [[I31]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: [[TMP30:%.*]] = load i32, ptr [[I31]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: store i32 [[ADD37]], ptr 
[[I31]], align 4 +// CHECK5-NEXT: [[TMP30:%.*]] = load i32, ptr [[I31]], align 4 // CHECK5-NEXT: [[IDXPROM38:%.*]] = sext i32 [[TMP30]] to i64 // CHECK5-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM38]] -// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX39]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: [[TMP31:%.*]] = load i32, ptr [[I31]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX39]], align 4 +// CHECK5-NEXT: [[TMP31:%.*]] = load i32, ptr [[I31]], align 4 // CHECK5-NEXT: [[IDXPROM40:%.*]] = sext i32 [[TMP31]] to i64 // CHECK5-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [10 x float], ptr @f, i64 0, i64 [[IDXPROM40]] -// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX41]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: store float 1.000000e+00, ptr [[ARRAYIDX41]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE42:%.*]] // CHECK5: omp.body.continue42: // CHECK5-NEXT: br label [[OMP_INNER_FOR_INC43:%.*]] // CHECK5: omp.inner.for.inc43: -// CHECK5-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK5-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV30]], align 4 // CHECK5-NEXT: [[ADD44:%.*]] = add i32 [[TMP32]], 1 -// CHECK5-NEXT: store i32 [[ADD44]], ptr [[DOTOMP_IV30]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND32]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK5-NEXT: store i32 [[ADD44]], ptr [[DOTOMP_IV30]], align 4 +// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND32]], !llvm.loop [[LOOP11:![0-9]+]] // CHECK5: omp.inner.for.end45: // CHECK5-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4 // CHECK5-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_20]], align 4 diff --git a/clang/test/OpenMP/spirv_variant_match.cpp b/clang/test/OpenMP/spirv_variant_match.cpp new file mode 100644 index 0000000000000..b37858bc3008b --- /dev/null +++ b/clang/test/OpenMP/spirv_variant_match.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc -DDEVICE +// RUN: %clang_cc1 -verify -triple spirv64-intel -aux-triple x86_64-unknown-unknown -fopenmp -fopenmp-is-target-device \ +// RUN:-fopenmp-host-ir-file-path %t-host.bc -nogpulib %s -emit-llvm -DDEVICE -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc -DTARGET +// RUN: %clang_cc1 -verify -triple spirv64-intel -aux-triple x86_64-unknown-unknown -fopenmp -fopenmp-is-target-device \ +// RUN: -fopenmp-host-ir-file-path %t-host.bc -nogpulib %s -emit-llvm -DTARGET -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc -DTARGET_KIND +// RUN: %clang_cc1 -verify -triple spirv64-intel -aux-triple x86_64-unknown-unknown -fopenmp -fopenmp-is-target-device \ +// RUN: -fopenmp-host-ir-file-path %t-host.bc -nogpulib %s -emit-llvm -DTARGET_KIND -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -verify -triple spirv64-intel -aux-triple x86_64-unknown-unknown -fopenmp -fopenmp-is-target-device \ +// RUN: -fopenmp-host-ir-file-path %t-host.bc -nogpulib %s -emit-llvm -o - | FileCheck 
%s + +// expected-no-diagnostics + +#pragma omp declare target +int foo() { return 0; } + +#ifdef DEVICE +#pragma omp begin declare variant match(device = {arch(spirv64)}) +#elif defined(TARGET) +#pragma omp begin declare variant match(target_device = {arch(spirv64)}) +#elif defined(TARGET_KIND) +#pragma omp begin declare variant match(target_device = {kind(gpu)}) +#else +#pragma omp begin declare variant match(device = {kind(gpu)}) +#endif + +int foo() { return 1; } +#pragma omp end declare variant +#pragma omp end declare target + +// CHECK-DAG: define{{.*}} @{{"_Z[0-9]+foo\$ompvariant\$.*"}}() + +// CHECK-DAG: call spir_func noundef i32 @{{"_Z[0-9]+foo\$ompvariant\$.*"}}() + +int main() { + int res; +#pragma omp target map(from \ + : res) + res = foo(); + return res; +} diff --git a/clang/test/Preprocessor/embed_preprocess_to_file.c b/clang/test/Preprocessor/embed_preprocess_to_file.c index 9895d958cf96d..b3c99d36f784a 100644 --- a/clang/test/Preprocessor/embed_preprocess_to_file.c +++ b/clang/test/Preprocessor/embed_preprocess_to_file.c @@ -37,3 +37,11 @@ const char even_more[] = { // DIRECTIVE-NEXT: #embed prefix(4, 5,) suffix(, 6, 7) /* clang -E -dE */ // DIRECTIVE-NEXT: , 8, 9, 10 // DIRECTIVE-NEXT: }; + +constexpr char big_one[] = { +#embed +}; + +// EXPANDED: constexpr char big_one[] = {255 +// DIRECTIVE: constexpr char big_one[] = { +// DIRECTIVE-NEXT: #embed diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 37dca2215af6b..832ce15e66250 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -525,3 +525,17 @@ template ::Array array> void test() {} void foo() { test<{1, 2, 3}>(); } } // namespace GH113518 + +namespace GH125821 { +template +struct A { A(T){} }; + +template +using Proxy = T; + +template +using C = Proxy< A >; + +C test{ 42 }; // expected-error {{no viable constructor or deduction guide for deduction of template arguments}} + +} // namespace GH125821 diff --git a/clang/test/SemaHLSL/use-cxx-alt-operator-names.hlsl b/clang/test/SemaHLSL/use-cxx-alt-operator-names.hlsl new file mode 100644 index 0000000000000..e93be2bbf4e69 --- /dev/null +++ b/clang/test/SemaHLSL/use-cxx-alt-operator-names.hlsl @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -ast-dump | FileCheck %s + +// CHECK: -FunctionDecl {{.*}} and 'void ()' +void and() {} + +// CHECK: -FunctionDecl {{.*}} and_eq 'void ()' +void and_eq() {} + +// CHECK: -FunctionDecl {{.*}} bitand 'void ()' +void bitand() {} + +// CHECK: -FunctionDecl {{.*}} bitor 'void ()' +void bitor() {} + +// CHECK: -FunctionDecl {{.*}} compl 'void ()' +void compl() {} + +// CHECK: -FunctionDecl {{.*}} not 'void ()' +void not() {} + +// CHECK: -FunctionDecl {{.*}} not_eq 'void ()' +void not_eq() {} + +// CHECK: -FunctionDecl {{.*}} or 'void ()' +void or() {} + +// CHECK: -FunctionDecl {{.*}} or_eq 'void ()' +void or_eq() {} + +// CHECK: -FunctionDecl {{.*}} xor 'void ()' +void xor() {} + +// CHECK: -FunctionDecl {{.*}} xor_eq 'void ()' +void xor_eq() {} diff --git a/clang/test/Tooling/clang-linker-wrapper-spirv-elf.cpp b/clang/test/Tooling/clang-linker-wrapper-spirv-elf.cpp index 4f8658064e857..9b16727d74192 100644 --- a/clang/test/Tooling/clang-linker-wrapper-spirv-elf.cpp +++ b/clang/test/Tooling/clang-linker-wrapper-spirv-elf.cpp @@ -1,6 +1,7 @@ // Verify the ELF packaging of OpenMP SPIR-V device images. 
// REQUIRES: system-linux // REQUIRES: spirv-tools +// REQUIRES: llvm-spirv // RUN: mkdir -p %t_tmp // RUN: cd %t_tmp // RUN: %clangxx -fopenmp -fopenmp-targets=spirv64-intel -nogpulib -c -o %t_clang-linker-wrapper-spirv-elf.o %s diff --git a/clang/test/Tooling/lit.local.cfg b/clang/test/Tooling/lit.local.cfg index bc2a096c8f64f..61f328c91e4d3 100644 --- a/clang/test/Tooling/lit.local.cfg +++ b/clang/test/Tooling/lit.local.cfg @@ -1,3 +1,5 @@ +import lit.util + if not config.root.clang_staticanalyzer: config.unsupported = True @@ -6,3 +8,7 @@ if config.spirv_tools_tests: config.substitutions.append(("spirv-dis", os.path.join(config.llvm_tools_dir, "spirv-dis"))) config.substitutions.append(("spirv-val", os.path.join(config.llvm_tools_dir, "spirv-val"))) config.substitutions.append(("spirv-as", os.path.join(config.llvm_tools_dir, "spirv-as"))) + config.substitutions.append(("spirv-link", os.path.join(config.llvm_tools_dir, "spirv-link"))) + +if lit.util.which("llvm-spirv"): + config.available_features.add("llvm-spirv") diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 8a2c3463189fa..0dddcfce58ca6 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -263,11 +263,12 @@ bool Options::processInstallAPIXOptions(InputArgList &Args) { } const StringRef ASpelling = NextA->getSpelling(); const auto &AValues = NextA->getValues(); + auto &UniqueArgs = FEOpts.UniqueArgs[Label]; if (AValues.empty()) - FEOpts.UniqueArgs[Label].emplace_back(ASpelling.str()); + UniqueArgs.emplace_back(ASpelling.str()); else for (const StringRef Val : AValues) - FEOpts.UniqueArgs[Label].emplace_back((ASpelling + Val).str()); + UniqueArgs.emplace_back((ASpelling + Val).str()); A->claim(); NextA->claim(); @@ -608,32 +609,37 @@ Options::processAndFilterOutInstallAPIOptions(ArrayRef Args) { ParsedArgs.hasArg(OPT_not_for_dyld_shared_cache); for (const Arg *A : ParsedArgs.filtered(OPT_allowable_client)) { + auto It = ArgToArchMap.find(A); LinkerOpts.AllowableClients[A->getValue()] = - ArgToArchMap.count(A) ? ArgToArchMap[A] : ArchitectureSet(); + It != ArgToArchMap.end() ? It->second : ArchitectureSet(); A->claim(); } for (const Arg *A : ParsedArgs.filtered(OPT_reexport_l)) { + auto It = ArgToArchMap.find(A); LinkerOpts.ReexportedLibraries[A->getValue()] = - ArgToArchMap.count(A) ? ArgToArchMap[A] : ArchitectureSet(); + It != ArgToArchMap.end() ? It->second : ArchitectureSet(); A->claim(); } for (const Arg *A : ParsedArgs.filtered(OPT_reexport_library)) { + auto It = ArgToArchMap.find(A); LinkerOpts.ReexportedLibraryPaths[A->getValue()] = - ArgToArchMap.count(A) ? ArgToArchMap[A] : ArchitectureSet(); + It != ArgToArchMap.end() ? It->second : ArchitectureSet(); A->claim(); } for (const Arg *A : ParsedArgs.filtered(OPT_reexport_framework)) { + auto It = ArgToArchMap.find(A); LinkerOpts.ReexportedFrameworks[A->getValue()] = - ArgToArchMap.count(A) ? ArgToArchMap[A] : ArchitectureSet(); + It != ArgToArchMap.end() ? It->second : ArchitectureSet(); A->claim(); } for (const Arg *A : ParsedArgs.filtered(OPT_rpath)) { + auto It = ArgToArchMap.find(A); LinkerOpts.RPaths[A->getValue()] = - ArgToArchMap.count(A) ? ArgToArchMap[A] : ArchitectureSet(); + It != ArgToArchMap.end() ? 
It->second : ArchitectureSet(); A->claim(); } diff --git a/clang/tools/libclang/CXString.cpp b/clang/tools/libclang/CXString.cpp index 5e427957a1092..aaa8f8eeb67a1 100644 --- a/clang/tools/libclang/CXString.cpp +++ b/clang/tools/libclang/CXString.cpp @@ -87,19 +87,7 @@ CXString createRef(StringRef String) { if (String.empty()) return createEmpty(); - // If the string is not nul-terminated, we have to make a copy. - - // FIXME: This is doing a one past end read, and should be removed! For memory - // we don't manage, the API string can become unterminated at any time outside - // our control. - - if (String.data()[String.size()] != 0) - return createDup(String); - - CXString Result; - Result.data = String.data(); - Result.private_flags = (unsigned) CXS_Unmanaged; - return Result; + return createDup(String); } CXString createDup(StringRef String) { diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 54d8ff0571ca6..5ab0867490122 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1187,8 +1187,8 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { EXPECT_TOKEN(Tokens[33], tok::identifier, TT_Unknown); Tokens = - annotate("template\n" - "requires (C1 && (C21 || C22 && C2e) && C3)\n" + annotate("template \n" + " requires(C1 && (C21 || C22 && C2e) && C3)\n" "struct Foo;"); ASSERT_EQ(Tokens.size(), 38u) << Tokens; EXPECT_TOKEN(Tokens[5], tok::kw_requires, TT_RequiresClause); diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp index 5c3459dbeb101..c7cfe7917c532 100644 --- a/clang/unittests/Sema/HeuristicResolverTest.cpp +++ b/clang/unittests/Sema/HeuristicResolverTest.cpp @@ -394,6 +394,22 @@ TEST(HeuristicResolver, MemberExpr_DeducedNonTypeTemplateParameter) { fieldDecl(hasName("found")).bind("output")); } +TEST(HeuristicResolver, MemberExpr_HangIssue126536) { + std::string Code = R"cpp( + template + void foo() { + T bar; + auto baz = (bar, bar); + baz.foo(); + } + )cpp"; + // Test resolution of "foo" in "baz.foo()". + // Here, we are testing that we do not get into an infinite loop. + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input")); +} + TEST(HeuristicResolver, DeclRefExpr_StaticMethod) { std::string Code = R"cpp( template diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 7f0c0c194dc91..eaa4ab52c243d 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -304,6 +304,17 @@ int __llvm_profile_get_padding_sizes_for_counters( */ void __llvm_profile_set_dumped(void); +/*! + * \brief Write custom target-specific profiling data to a seperate file. + * Used by offload PGO. + */ +int __llvm_write_custom_profile(const char *Target, + const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, + const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, + const char *NamesEnd); + /*! * This variable is defined in InstrProfilingRuntime.cpp as a hidden * symbol. 
Its main purpose is to enable profile runtime user to diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 343063fd6b754..503d159fd9817 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -541,6 +541,17 @@ static FILE *getFileObject(const char *OutputName) { return fopen(OutputName, "ab"); } +static void closeFileObject(FILE *OutputFile) { + if (OutputFile == getProfileFile()) { + fflush(OutputFile); + if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) { + lprofUnlockFileHandle(OutputFile); + } + } else { + fclose(OutputFile); + } +} + /* Write profile data to file \c OutputName. */ static int writeFile(const char *OutputName) { int RetVal; @@ -562,15 +573,7 @@ static int writeFile(const char *OutputName) { initFileWriter(&fileWriter, OutputFile); RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone); - if (OutputFile == getProfileFile()) { - fflush(OutputFile); - if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) { - lprofUnlockFileHandle(OutputFile); - } - } else { - fclose(OutputFile); - } - + closeFileObject(OutputFile); return RetVal; } @@ -1359,4 +1362,107 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File, return 0; } +int __llvm_write_custom_profile(const char *Target, + const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, + const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, + const char *NamesEnd) { + int ReturnValue = 0, FilenameLength, TargetLength; + char *FilenameBuf, *TargetFilename; + const char *Filename; + + /* Save old profile data */ + FILE *oldFile = getProfileFile(); + + // Temporarily suspend getting SIGKILL when the parent exits. + int PDeathSig = lprofSuspendSigKill(); + + if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) { + PROF_NOTE("Profile data not written to file: %s.\n", "already written"); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return 0; + } + + /* Check if there is llvm/runtime version mismatch. */ + if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) { + PROF_ERR("Runtime and instrumentation version mismatch : " + "expected %d, but get %d\n", + INSTR_PROF_RAW_VERSION, + (int)GET_VERSION(__llvm_profile_get_version())); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return -1; + } + + /* Get current filename */ + FilenameLength = getCurFilenameLength(); + FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1); + Filename = getCurFilename(FilenameBuf, 0); + + /* Check the filename. */ + if (!Filename) { + PROF_ERR("Failed to write file : %s\n", "Filename not set"); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return -1; + } + + /* Allocate new space for our target-specific PGO filename */ + TargetLength = strlen(Target); + TargetFilename = + (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2); + + /* Find file basename and path sizes */ + int32_t DirEnd = FilenameLength - 1; + while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) { + DirEnd--; + } + uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize; + + /* Prepend "TARGET." 
to current filename */ + if (DirSize > 0) { + memcpy(TargetFilename, Filename, DirSize); + } + memcpy(TargetFilename + DirSize, Target, TargetLength); + TargetFilename[TargetLength + DirSize] = '.'; + memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize, + BaseSize); + TargetFilename[FilenameLength + 1 + TargetLength] = 0; + + /* Open and truncate target-specific PGO file */ + FILE *OutputFile = fopen(TargetFilename, "w"); + setProfileFile(OutputFile); + + if (!OutputFile) { + PROF_ERR("Failed to open file : %s\n", TargetFilename); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return -1; + } + + FreeHook = &free; + setupIOBuffer(); + + /* Write custom data */ + ProfDataWriter fileWriter; + initFileWriter(&fileWriter, OutputFile); + + /* Write custom data to the file */ + ReturnValue = lprofWriteDataImpl( + &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, + lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0); + closeFileObject(OutputFile); + + // Restore SIGKILL. + if (PDeathSig == 1) + lprofRestoreSigKill(); + + /* Restore old profiling file */ + setProfileFile(oldFile); + + return ReturnValue; +} + #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_local_cache.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_local_cache.h index e495c56f03775..6e54c4852fbb6 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_local_cache.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_local_cache.h @@ -166,7 +166,7 @@ struct SizeClassAllocator32LocalCache { DCHECK_GT(c->count, 0); } void *res = c->batch[--c->count]; - PREFETCH(c->batch[c->count - 1]); + PREFETCH(c->batch[c->count > 0 ? c->count - 1 : 0]); stats_.Add(AllocatorStatAllocated, c->class_size); return res; } diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt index 2ee8138350876..15068887267a0 100644 --- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt +++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/global_symbols.txt @@ -103,6 +103,7 @@ fputwc U free U freelocale U fwrite U +getauxval U getc U getcwd U getenv U diff --git a/compiler-rt/lib/scudo/standalone/chunk.h b/compiler-rt/lib/scudo/standalone/chunk.h index 9228df0471890..a1b8e723d4cb5 100644 --- a/compiler-rt/lib/scudo/standalone/chunk.h +++ b/compiler-rt/lib/scudo/standalone/chunk.h @@ -125,7 +125,7 @@ inline void loadHeader(u32 Cookie, const void *Ptr, *NewUnpackedHeader = bit_cast(NewPackedHeader); if (UNLIKELY(NewUnpackedHeader->Checksum != computeHeaderChecksum(Cookie, Ptr, NewUnpackedHeader))) - reportHeaderCorruption(const_cast(Ptr)); + reportHeaderCorruption(NewUnpackedHeader, const_cast(Ptr)); } inline bool isValid(u32 Cookie, const void *Ptr, diff --git a/compiler-rt/lib/scudo/standalone/report.cpp b/compiler-rt/lib/scudo/standalone/report.cpp index 9cef0adc0bb31..14a4066d37200 100644 --- a/compiler-rt/lib/scudo/standalone/report.cpp +++ b/compiler-rt/lib/scudo/standalone/report.cpp @@ -9,6 +9,7 @@ #include "report.h" #include "atomic_helpers.h" +#include "chunk.h" #include "string_utils.h" #include @@ -65,9 +66,18 @@ void NORETURN reportInvalidFlag(const char *FlagType, const char *Value) { // The checksum of a chunk header is invalid. This could be caused by an // {over,under}write of the header, a pointer that is not an actual chunk. 
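Note on the compiler-rt/lib/profile hunks above: they add a new runtime entry point, __llvm_write_custom_profile, intended for offload PGO. It writes a caller-supplied (data, counters, names) triple to its own raw profile, named by prepending the target string to the basename of the current profile filename (so with the target used below, default.profraw would become spirv64-intel.default.profraw). The sketch that follows is a minimal, hypothetical caller; the buffer variables, the target string, and the include path are illustrative assumptions, not part of this patch.

/* Hedged sketch: an offload runtime dumping device-side profile data.
   The buffers are assumed to have been copied back from the device.
   Include path is an assumption; the header lives in compiler-rt/lib/profile. */
#include "InstrProfiling.h"

extern const __llvm_profile_data *DevDataBegin, *DevDataEnd;  /* hypothetical */
extern const char *DevCountersBegin, *DevCountersEnd;         /* hypothetical */
extern const char *DevNamesBegin, *DevNamesEnd;               /* hypothetical */

static int dumpDeviceProfile(void) {
  /* Returns 0 on success and -1 on error, mirroring writeFile() above. */
  return __llvm_write_custom_profile("spirv64-intel", DevDataBegin, DevDataEnd,
                                     DevCountersBegin, DevCountersEnd,
                                     DevNamesBegin, DevNamesEnd);
}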
-void NORETURN reportHeaderCorruption(void *Ptr) { - ScopedErrorReport Report; - Report.append("corrupted chunk header at address %p\n", Ptr); +void NORETURN reportHeaderCorruption(void *Header, void *Ptr) { + ScopedErrorReport Report; + Report.append("corrupted chunk header at address %p", Ptr); + if (*static_cast(Header) == 0U) { + // Header all zero, which could indicate that this might be a pointer that + // has been double freed but the memory has been released to the kernel. + Report.append(": chunk header is zero and might indicate memory corruption " + "or a double free\n", + Ptr); + } else { + Report.append(": most likely due to memory corruption\n", Ptr); + } } // The allocator was compiled with parameters that conflict with field size diff --git a/compiler-rt/lib/scudo/standalone/report.h b/compiler-rt/lib/scudo/standalone/report.h index a510fdaebb6de..c0214b51560e9 100644 --- a/compiler-rt/lib/scudo/standalone/report.h +++ b/compiler-rt/lib/scudo/standalone/report.h @@ -12,7 +12,6 @@ #include "internal_defs.h" namespace scudo { - // Reports are *fatal* unless stated otherwise. // Generic error, adds newline to end of message. @@ -25,7 +24,7 @@ void NORETURN reportRawError(const char *Message); void NORETURN reportInvalidFlag(const char *FlagType, const char *Value); // Chunk header related errors. -void NORETURN reportHeaderCorruption(void *Ptr); +void NORETURN reportHeaderCorruption(void *Header, void *Ptr); // Sanity checks related error. void NORETURN reportSanityCheckError(const char *Field); diff --git a/compiler-rt/lib/scudo/standalone/tests/report_test.cpp b/compiler-rt/lib/scudo/standalone/tests/report_test.cpp index 6c46243053d9e..514837df1a43a 100644 --- a/compiler-rt/lib/scudo/standalone/tests/report_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/report_test.cpp @@ -8,6 +8,7 @@ #include "tests/scudo_unit_test.h" +#include "chunk.h" #include "report.h" TEST(ScudoReportDeathTest, Check) { @@ -20,9 +21,11 @@ TEST(ScudoReportDeathTest, Check) { TEST(ScudoReportDeathTest, Generic) { // Potentially unused if EXPECT_DEATH isn't defined. 
UNUSED void *P = reinterpret_cast(0x42424242U); + UNUSED scudo::Chunk::PackedHeader Header = {}; EXPECT_DEATH(scudo::reportError("TEST123"), "Scudo ERROR.*TEST123"); EXPECT_DEATH(scudo::reportInvalidFlag("ABC", "DEF"), "Scudo ERROR.*ABC.*DEF"); - EXPECT_DEATH(scudo::reportHeaderCorruption(P), "Scudo ERROR.*42424242"); + EXPECT_DEATH(scudo::reportHeaderCorruption(&Header, P), + "Scudo ERROR.*42424242"); EXPECT_DEATH(scudo::reportSanityCheckError("XYZ"), "Scudo ERROR.*XYZ"); EXPECT_DEATH(scudo::reportAlignmentTooBig(123, 456), "Scudo ERROR.*123.*456"); EXPECT_DEATH(scudo::reportAllocationSizeTooBig(123, 456, 789), @@ -54,6 +57,19 @@ TEST(ScudoReportDeathTest, CSpecific) { "Scudo ERROR.*123.*456"); } +TEST(ScudoReportDeathTest, HeaderCorruption) { + UNUSED void *P = reinterpret_cast(0x42424242U); + UNUSED scudo::Chunk::PackedHeader Header = {}; + EXPECT_DEATH(scudo::reportHeaderCorruption(&Header, P), + "Scudo ERROR.*corrupted chunk header at address 0x.*42424242: " + "chunk header is zero and might indicate memory " + "corruption or a double free"); + Header = 10U; + EXPECT_DEATH(scudo::reportHeaderCorruption(&Header, P), + "Scudo ERROR.*corrupted chunk header at address 0x.*42424242: " + "most likely due to memory corruption"); +} + #if SCUDO_LINUX || SCUDO_TRUSTY || SCUDO_ANDROID #include "report_linux.h" diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h index e19fcde8d0e64..1e637895d8e99 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRType.h +++ b/flang/include/flang/Optimizer/Dialect/FIRType.h @@ -276,6 +276,13 @@ inline mlir::Type unwrapPassByRefType(mlir::Type t) { return t; } +/// Extracts the innermost type, T, **potentially** wrapped inside: +/// >> +/// +/// Any element absent from the above pattern does not affect the returned +/// value: T. +mlir::Type getFortranElementType(mlir::Type ty); + /// Unwrap either a sequence or a boxed sequence type, returning the element /// type of the sequence type. /// e.g., diff --git a/flang/include/flang/Optimizer/OpenACC/FIROpenACCTypeInterfaces.h b/flang/include/flang/Optimizer/OpenACC/FIROpenACCTypeInterfaces.h index c1bea32a22361..3e343f347e4ae 100644 --- a/flang/include/flang/Optimizer/OpenACC/FIROpenACCTypeInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/FIROpenACCTypeInterfaces.h @@ -18,6 +18,19 @@ namespace fir::acc { +template +struct OpenACCPointerLikeModel + : public mlir::acc::PointerLikeType::ExternalModel< + OpenACCPointerLikeModel, T> { + mlir::Type getElementType(mlir::Type pointer) const { + return mlir::cast(pointer).getElementType(); + } + mlir::acc::VariableTypeCategory + getPointeeTypeCategory(mlir::Type pointer, + mlir::TypedValue varPtr, + mlir::Type varType) const; +}; + template struct OpenACCMappableModel : public mlir::acc::MappableType::ExternalModel, @@ -36,6 +49,9 @@ struct OpenACCMappableModel llvm::SmallVector generateAccBounds(mlir::Type type, mlir::Value var, mlir::OpBuilder &builder) const; + + mlir::acc::VariableTypeCategory getTypeCategory(mlir::Type type, + mlir::Value var) const; }; } // namespace fir::acc diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 61f8b0835c958..0b6e0119c16c3 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -383,30 +383,33 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> { freeing up an additional register in numerous functions. 
However, this approach can make debugging unfeasible on certain machines. }]; - let options = [ - Option<"framePointerKind", "frame-pointer", - "mlir::LLVM::framePointerKind::FramePointerKind", - /*default=*/"mlir::LLVM::framePointerKind::FramePointerKind{}", - "frame pointer">, - Option<"noInfsFPMath", "no-infs-fp-math", - "bool", /*default=*/"false", - "Set the no-infs-fp-math attribute on functions in the module.">, - Option<"noNaNsFPMath", "no-nans-fp-math", - "bool", /*default=*/"false", - "Set the no-nans-fp-math attribute on functions in the module.">, - Option<"approxFuncFPMath", "approx-func-fp-math", - "bool", /*default=*/"false", + let options = + [Option<"framePointerKind", "frame-pointer", + "mlir::LLVM::framePointerKind::FramePointerKind", + /*default=*/"mlir::LLVM::framePointerKind::FramePointerKind{}", + "frame pointer", [{::llvm::cl::values( + clEnumValN(mlir::LLVM::framePointerKind::FramePointerKind::None, "None", ""), + clEnumValN(mlir::LLVM::framePointerKind::FramePointerKind::NonLeaf, "NonLeaf", ""), + clEnumValN(mlir::LLVM::framePointerKind::FramePointerKind::All, "All", ""), + clEnumValN(mlir::LLVM::framePointerKind::FramePointerKind::Reserved, "Reserved", "") + )}]>, + Option<"noInfsFPMath", "no-infs-fp-math", "bool", /*default=*/"false", + "Set the no-infs-fp-math attribute on functions in the module.">, + Option<"noNaNsFPMath", "no-nans-fp-math", "bool", /*default=*/"false", + "Set the no-nans-fp-math attribute on functions in the module.">, + Option< + "approxFuncFPMath", "approx-func-fp-math", "bool", + /*default=*/"false", "Set the approx-func-fp-math attribute on functions in the module.">, - Option<"noSignedZerosFPMath", "no-signed-zeros-fp-math", - "bool", /*default=*/"false", - "Set the no-signed-zeros-fp-math attribute on functions in the module.">, - Option<"unsafeFPMath", "unsafe-fp-math", - "bool", /*default=*/"false", - "Set the unsafe-fp-math attribute on functions in the module.">, - Option<"tuneCPU", "tune-cpu", - "llvm::StringRef", /*default=*/"llvm::StringRef{}", - "Set the tune-cpu attribute on functions in the module.">, -]; + Option<"noSignedZerosFPMath", "no-signed-zeros-fp-math", "bool", + /*default=*/"false", + "Set the no-signed-zeros-fp-math attribute on functions in the " + "module.">, + Option<"unsafeFPMath", "unsafe-fp-math", "bool", /*default=*/"false", + "Set the unsafe-fp-math attribute on functions in the module.">, + Option<"tuneCPU", "tune-cpu", "std::string", /*default=*/"", + "Set the tune-cpu attribute on functions in the module.">, + ]; } def AssumedRankOpConversion : Pass<"fir-assumed-rank-op", "mlir::ModuleOp"> { diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h index 7cdca1214e749..dd610c9702c28 100644 --- a/flang/include/flang/Semantics/openmp-directive-sets.h +++ b/flang/include/flang/Semantics/openmp-directive-sets.h @@ -21,6 +21,8 @@ namespace llvm::omp { //===----------------------------------------------------------------------===// // - topSet: The directive appears alone or as the first in a // compound construct. +// - bottomSet: The directive appears alone or as the last in a +// compound construct. // - allSet: All standalone or compound uses of the directive. 
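For the FunctionAttr pass options reworked in the Passes.td hunk above: frame-pointer now carries explicit enum values (None, NonLeaf, All, Reserved) and tune-cpu becomes a std::string. Below is a hedged sketch of programmatic use, modeled on the Pipelines.cpp hunk later in this patch; the field order follows that hunk, and the concrete values chosen here are illustrative assumptions rather than recommendations.

// Build the pass with an explicit frame-pointer kind; the trailing "" is the
// new std::string tune-cpu option.
pm.addPass(fir::createFunctionAttr(
    {mlir::LLVM::framePointerKind::FramePointerKind::NonLeaf,
     /*noInfsFPMath=*/false, /*noNaNsFPMath=*/false,
     /*approxFuncFPMath=*/false, /*noSignedZerosFPMath=*/false,
     /*unsafeFPMath=*/false, /*tuneCPU=*/""}));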
static const OmpDirectiveSet topDistributeSet{ @@ -172,6 +174,11 @@ static const OmpDirectiveSet topTeamsSet{ Directive::OMPD_teams_loop, }; +static const OmpDirectiveSet bottomTeamsSet{ + Directive::OMPD_target_teams, + Directive::OMPD_teams, +}; + static const OmpDirectiveSet allTeamsSet{ OmpDirectiveSet{ Directive::OMPD_target_teams, diff --git a/flang/include/flang/Tools/PointerModels.h b/flang/include/flang/Tools/PointerModels.h index c3c0977d6e54a..0d22ed3ca7f4f 100644 --- a/flang/include/flang/Tools/PointerModels.h +++ b/flang/include/flang/Tools/PointerModels.h @@ -9,7 +9,6 @@ #ifndef FORTRAN_TOOLS_POINTER_MODELS_H #define FORTRAN_TOOLS_POINTER_MODELS_H -#include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" /// models for FIR pointer like types that already provide a `getElementType` @@ -24,13 +23,4 @@ struct OpenMPPointerLikeModel } }; -template -struct OpenACCPointerLikeModel - : public mlir::acc::PointerLikeType::ExternalModel< - OpenACCPointerLikeModel, T> { - mlir::Type getElementType(mlir::Type pointer) const { - return mlir::cast(pointer).getElementType(); - } -}; - #endif // FORTRAN_TOOLS_POINTER_MODELS_H diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index b7674bd093f68..622848eac2dd2 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -261,12 +261,12 @@ bool CodeGenAction::beginSourceFileAction() { } // Load the MLIR dialects required by Flang - mlir::DialectRegistry registry; - mlirCtx = std::make_unique(registry); - fir::support::registerNonCodegenDialects(registry); - fir::support::loadNonCodegenDialects(*mlirCtx); + mlirCtx = std::make_unique(); fir::support::loadDialects(*mlirCtx); fir::support::registerLLVMTranslation(*mlirCtx); + mlir::DialectRegistry registry; + fir::acc::registerOpenACCExtensions(registry); + mlirCtx->appendDialectRegistry(registry); const llvm::TargetMachine &targetMachine = ci.getTargetMachine(); diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index a11a2c824bf9e..3dd35ed9ae481 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1803,6 +1803,7 @@ static void privatizeIv(Fortran::lower::AbstractConverter &converter, builder, recipeName, loc, ivValue.getType()); std::stringstream asFortran; + asFortran << Fortran::lower::mangle::demangleName(toStringRef(sym.name())); auto op = createDataEntryOp( builder, loc, ivValue, asFortran, {}, true, /*implicit=*/true, mlir::acc::DataClause::acc_private, ivValue.getType(), diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp index 951293b133677..22cd0679050db 100644 --- a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp +++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp @@ -197,12 +197,11 @@ static void getLengthParameters(fir::FirOpBuilder &builder, mlir::Location loc, // The verifier for EmboxOp doesn't allow length parameters when the the // character already has static LEN. genLengthParameters may still return them // in this case. 
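The simplification in the hunk below relies on the getFortranElementType helper introduced earlier in this patch (FIRType.h declaration, FIRType.cpp definition: unwrapRefType, then unwrapPassByRefType, then unwrapSequenceType). A hedged illustration of what it yields for a couple of common FIR types; the concrete type spellings are examples chosen for this note, not taken from the patch.

// getFortranElementType strips an outer reference, a box, and any array
// (sequence) wrapper, returning the Fortran element type, e.g.:
//   !fir.box<!fir.array<10xf32>>  ->  f32
//   !fir.ref<!fir.char<1,10>>     ->  !fir.char<1,10>
mlir::Type elemTy = fir::getFortranElementType(moldArg.getType());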
- mlir::Type unwrappedType = - fir::unwrapRefType(fir::unwrapSeqOrBoxedSeqType(moldArg.getType())); - if (auto strTy = mlir::dyn_cast(unwrappedType)) { - if (strTy.hasConstantLen()) - lenParams.resize(0); - } + auto strTy = mlir::dyn_cast( + fir::getFortranElementType(moldArg.getType())); + + if (strTy && strTy.hasConstantLen()) + lenParams.resize(0); } static bool diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 01f3a0326db21..5827d1c3c529e 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -51,7 +51,7 @@ static bool hasGlobalOpTargetAttr(mlir::Value v, fir::AddrOfOp op) { v, fir::GlobalOp::getTargetAttrName(globalOpName)); } -mlir::Value getOriginalDef(mlir::Value v) { +static mlir::Value getOriginalDef(mlir::Value v) { mlir::Operation *defOp; bool breakFromLoop = false; while (!breakFromLoop && (defOp = v.getDefiningOp())) { @@ -578,16 +578,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case([&](auto op) { - // If the load is from a leaf source, return the leaf. Do not track - // through indirections otherwise. - // TODO: Add support to fir.alloca and fir.allocmem - auto def = getOriginalDef(op.getMemref()); - if (isDummyArgument(def) || - def.template getDefiningOp()) { - v = def; - defOp = v.getDefiningOp(); - return; - } // If load is inside target and it points to mapped item, // continue tracking. Operation *loadMemrefOp = op.getMemref().getDefiningOp(); @@ -600,6 +590,40 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); return; } + + // If we are loading a box reference, but following the data, + // we gather the attributes of the box to populate the source + // and stop tracking. + if (auto boxTy = mlir::dyn_cast(ty); + boxTy && followingData) { + + if (mlir::isa(boxTy.getEleTy())) + attributes.set(Attribute::Pointer); + + auto def = getOriginalDef(op.getMemref()); + if (auto addrOfOp = def.template getDefiningOp()) { + global = addrOfOp.getSymbol(); + + if (hasGlobalOpTargetAttr(def, addrOfOp)) + attributes.set(Attribute::Target); + + type = SourceKind::Global; + } + + // TODO: Add support to fir.alloca and fir.allocmem + // if (auto allocOp = def.template getDefiningOp()) { + // ... + // } + + if (isDummyArgument(def)) { + defOp = nullptr; + v = def; + } + + breakFromLoop = true; + return; + } + // No further tracking for addresses loaded from memory for now. type = SourceKind::Indirect; breakFromLoop = true; diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index cb4eb8303a495..c76b7cde55bdd 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -589,10 +589,42 @@ struct CallOpConversion : public fir::FIROpConversion { // Convert arith::FastMathFlagsAttr to LLVM::FastMathFlagsAttr. mlir::arith::AttrConvertFastMathToLLVM attrConvert(call); - rewriter.replaceOpWithNewOp( + auto llvmCall = rewriter.replaceOpWithNewOp( call, resultTys, adaptor.getOperands(), addLLVMOpBundleAttrs(rewriter, attrConvert.getAttrs(), adaptor.getOperands().size())); + if (mlir::ArrayAttr argAttrsArray = call.getArgAttrsAttr()) { + // sret and byval type needs to be converted. 
+ auto convertTypeAttr = [&](const mlir::NamedAttribute &attr) { + return mlir::TypeAttr::get(convertType( + llvm::cast(attr.getValue()).getValue())); + }; + llvm::SmallVector newArgAttrsArray; + for (auto argAttrs : argAttrsArray) { + llvm::SmallVector convertedAttrs; + for (const mlir::NamedAttribute &attr : + llvm::cast(argAttrs)) { + if (attr.getName().getValue() == + mlir::LLVM::LLVMDialect::getByValAttrName()) { + convertedAttrs.push_back(rewriter.getNamedAttr( + mlir::LLVM::LLVMDialect::getByValAttrName(), + convertTypeAttr(attr))); + } else if (attr.getName().getValue() == + mlir::LLVM::LLVMDialect::getStructRetAttrName()) { + convertedAttrs.push_back(rewriter.getNamedAttr( + mlir::LLVM::LLVMDialect::getStructRetAttrName(), + convertTypeAttr(attr))); + } else { + convertedAttrs.push_back(attr); + } + } + newArgAttrsArray.emplace_back( + mlir::DictionaryAttr::get(rewriter.getContext(), convertedAttrs)); + } + llvmCall.setArgAttrsAttr(rewriter.getArrayAttr(newArgAttrsArray)); + } + if (mlir::ArrayAttr resAttrs = call.getResAttrsAttr()) + llvmCall.setResAttrsAttr(resAttrs); return mlir::success(); } }; diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index c099a08ffd30a..5c9da0321bcc4 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -534,19 +534,44 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } else if constexpr (std::is_same_v, fir::CallOp>) { fir::CallOp newCall; if (callOp.getCallee()) { - newCall = - rewriter->create(loc, *callOp.getCallee(), newResTys, newOpers); + newCall = rewriter->create(loc, *callOp.getCallee(), + newResTys, newOpers); } else { - // TODO: llvm dialect must be updated to propagate argument on - // attributes for indirect calls. See: - // https://discourse.llvm.org/t/should-llvm-callop-be-able-to-carry-argument-attributes-for-indirect-calls/75431 - if (hasByValOrSRetArgs(newInTyAndAttrs)) - TODO(loc, - "passing argument or result on the stack in indirect calls"); newOpers[0].setType(mlir::FunctionType::get( callOp.getContext(), mlir::TypeRange{newInTypes}.drop_front(dropFront), newResTys)); - newCall = rewriter->create(loc, newResTys, newOpers); + newCall = rewriter->create(loc, newResTys, newOpers); + // Set ABI argument attributes on call operation since they are not + // accessible via a FuncOp in indirect calls. 
+ if (hasByValOrSRetArgs(newInTyAndAttrs)) { + llvm::SmallVector argAttrsArray; + for (const auto &arg : + llvm::ArrayRef( + newInTyAndAttrs) + .drop_front(dropFront)) { + mlir::NamedAttrList argAttrs; + const auto &attr = std::get(arg); + if (attr.isByVal()) { + mlir::Type elemType = + fir::dyn_cast_ptrOrBoxEleTy(std::get(arg)); + argAttrs.set(mlir::LLVM::LLVMDialect::getByValAttrName(), + mlir::TypeAttr::get(elemType)); + } else if (attr.isSRet()) { + mlir::Type elemType = + fir::dyn_cast_ptrOrBoxEleTy(std::get(arg)); + argAttrs.set(mlir::LLVM::LLVMDialect::getStructRetAttrName(), + mlir::TypeAttr::get(elemType)); + if (auto align = attr.getAlignment()) { + argAttrs.set(mlir::LLVM::LLVMDialect::getAlignAttrName(), + rewriter->getIntegerAttr( + rewriter->getIntegerType(32), align)); + } + } + argAttrsArray.emplace_back( + argAttrs.getDictionary(rewriter->getContext())); + } + newCall.setArgAttrsAttr(rewriter->getArrayAttr(argAttrsArray)); + } } LLVM_DEBUG(llvm::dbgs() << "replacing call with " << newCall << '\n'); if (wrap) diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index fa83aa380e489..7e50622db08c9 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1121,11 +1121,12 @@ void fir::CallOp::print(mlir::OpAsmPrinter &p) { p.printOptionalAttrDict((*this)->getAttrs(), {fir::CallOp::getCalleeAttrNameStr(), - getFastmathAttrName(), getProcedureAttrsAttrName()}); - auto resultTypes{getResultTypes()}; - llvm::SmallVector argTypes( - llvm::drop_begin(getOperandTypes(), isDirect ? 0 : 1)); - p << " : " << mlir::FunctionType::get(getContext(), argTypes, resultTypes); + getFastmathAttrName(), getProcedureAttrsAttrName(), + getArgAttrsAttrName(), getResAttrsAttrName()}); + p << " : "; + mlir::call_interface_impl::printFunctionSignature( + p, getArgs().drop_front(isDirect ? 
0 : 1).getTypes(), getArgAttrsAttr(), + /*isVariadic=*/false, getResultTypes(), getResAttrsAttr()); } mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser, @@ -1142,7 +1143,6 @@ mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser, attrs)) return mlir::failure(); - mlir::Type type; if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::Paren)) return mlir::failure(); @@ -1163,13 +1163,17 @@ mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser, fmfAttrName, attrs)) return mlir::failure(); - if (parser.parseOptionalAttrDict(attrs) || parser.parseColon() || - parser.parseType(type)) + if (parser.parseOptionalAttrDict(attrs) || parser.parseColon()) return mlir::failure(); - - auto funcType = mlir::dyn_cast(type); - if (!funcType) + llvm::SmallVector argTypes; + llvm::SmallVector resTypes; + llvm::SmallVector argAttrs; + llvm::SmallVector resultAttrs; + if (mlir::call_interface_impl::parseFunctionSignature( + parser, argTypes, argAttrs, resTypes, resultAttrs)) return parser.emitError(parser.getNameLoc(), "expected function type"); + mlir::FunctionType funcType = + mlir::FunctionType::get(parser.getContext(), argTypes, resTypes); if (isDirect) { if (parser.resolveOperands(operands, funcType.getInputs(), parser.getNameLoc(), result.operands)) @@ -1183,8 +1187,11 @@ mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser, parser.getNameLoc(), result.operands)) return mlir::failure(); } - result.addTypes(funcType.getResults()); result.attributes = attrs; + mlir::call_interface_impl::addArgAndResultAttrs( + parser.getBuilder(), result, argAttrs, resultAttrs, + getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); + result.addTypes(funcType.getResults()); return mlir::success(); } diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index 67d918cc0f41c..719cb1b9d75aa 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -426,6 +426,11 @@ mlir::Type unwrapAllRefAndSeqType(mlir::Type ty) { } } +mlir::Type getFortranElementType(mlir::Type ty) { + return fir::unwrapSequenceType( + fir::unwrapPassByRefType(fir::unwrapRefType(ty))); +} + mlir::Type unwrapSeqOrBoxedSeqType(mlir::Type ty) { if (auto seqTy = mlir::dyn_cast(ty)) return seqTy.getEleTy(); @@ -1365,23 +1370,12 @@ void FIROpsDialect::registerTypes() { TypeDescType, fir::VectorType, fir::DummyScopeType>(); fir::ReferenceType::attachInterface< OpenMPPointerLikeModel>(*getContext()); - fir::ReferenceType::attachInterface< - OpenACCPointerLikeModel>(*getContext()); - fir::PointerType::attachInterface>( *getContext()); - fir::PointerType::attachInterface>( - *getContext()); - fir::HeapType::attachInterface>( *getContext()); - fir::HeapType::attachInterface>( - *getContext()); - fir::LLVMPointerType::attachInterface< OpenMPPointerLikeModel>(*getContext()); - fir::LLVMPointerType::attachInterface< - OpenACCPointerLikeModel>(*getContext()); } std::optional> diff --git a/flang/lib/Optimizer/OpenACC/CMakeLists.txt b/flang/lib/Optimizer/OpenACC/CMakeLists.txt index 04d351ac265d6..1bfae603fd80d 100644 --- a/flang/lib/Optimizer/OpenACC/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenACC/CMakeLists.txt @@ -6,6 +6,7 @@ add_flang_library(FIROpenACCSupport DEPENDS FIRBuilder + FIRCodeGen FIRDialect FIRDialectSupport FIRSupport @@ -14,6 +15,7 @@ add_flang_library(FIROpenACCSupport LINK_LIBS FIRBuilder + FIRCodeGen FIRDialect FIRDialectSupport FIRSupport diff --git 
a/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp b/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp index 94ab31de1763d..0ebc62e7f2fd6 100644 --- a/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/FIROpenACCTypeInterfaces.cpp @@ -15,6 +15,7 @@ #include "flang/Optimizer/Builder/DirectivesCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/HLFIRTools.h" +#include "flang/Optimizer/CodeGen/CGOps.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Dialect/FIRType.h" @@ -24,6 +25,7 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Support/LLVM.h" +#include "llvm/ADT/TypeSwitch.h" namespace fir::acc { @@ -224,4 +226,145 @@ OpenACCMappableModel::generateAccBounds( return {}; } +static bool isScalarLike(mlir::Type type) { + return fir::isa_trivial(type) || fir::isa_ref_type(type); +} + +static bool isArrayLike(mlir::Type type) { + return mlir::isa(type); +} + +static bool isCompositeLike(mlir::Type type) { + return mlir::isa(type); +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory( + mlir::Type type, mlir::Value var) const { + return mlir::acc::VariableTypeCategory::array; +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCMappableModel::getTypeCategory(mlir::Type type, + mlir::Value var) const { + + mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(type); + + // If the type enclosed by the box is a mappable type, then have it + // provide the type category. + if (auto mappableTy = mlir::dyn_cast(eleTy)) + return mappableTy.getTypeCategory(var); + + // For all arrays, despite whether they are allocatable, pointer, assumed, + // etc, we'd like to categorize them as "array". + if (isArrayLike(eleTy)) + return mlir::acc::VariableTypeCategory::array; + + // We got here because we don't have an array nor a mappable type. At this + // point, we know we have a type that fits the "aggregate" definition since it + // is a type with a descriptor. Try to refine it by checking if it matches the + // "composite" definition. + if (isCompositeLike(eleTy)) + return mlir::acc::VariableTypeCategory::composite; + + // Even if we have a scalar type - simply because it is wrapped in a box + // we want to categorize it as "nonscalar". Anything else would've been + // non-scalar anyway. + return mlir::acc::VariableTypeCategory::nonscalar; +} + +static mlir::TypedValue +getBaseRef(mlir::TypedValue varPtr) { + // If there is no defining op - the unwrapped reference is the base one. + mlir::Operation *op = varPtr.getDefiningOp(); + if (!op) + return varPtr; + + // Look to find if this value originates from an interior pointer + // calculation op. + mlir::Value baseRef = + llvm::TypeSwitch(op) + .Case([&](auto op) { + // Get the base object. + return op.getMemref(); + }) + .Case([&](auto op) { + // Get the base array on which the coordinate is being applied. + return op.getMemref(); + }) + .Case([&](auto op) { + // For coordinate operation which is applied on derived type + // object, get the base object. + return op.getRef(); + }) + .Default([&](mlir::Operation *) { return varPtr; }); + + return mlir::cast>(baseRef); +} + +static mlir::acc::VariableTypeCategory +categorizePointee(mlir::Type pointer, + mlir::TypedValue varPtr, + mlir::Type varType) { + // FIR uses operations to compute interior pointers. 
+ // So for example, an array element or composite field access to a float + // value would both be represented as !fir.ref. We do not want to treat + // such a reference as a scalar. Thus unwrap interior pointer calculations. + auto baseRef = getBaseRef(varPtr); + mlir::Type eleTy = baseRef.getType().getElementType(); + + if (auto mappableTy = mlir::dyn_cast(eleTy)) + return mappableTy.getTypeCategory(varPtr); + + if (isScalarLike(eleTy)) + return mlir::acc::VariableTypeCategory::scalar; + if (isArrayLike(eleTy)) + return mlir::acc::VariableTypeCategory::array; + if (isCompositeLike(eleTy)) + return mlir::acc::VariableTypeCategory::composite; + if (mlir::isa(eleTy)) + return mlir::acc::VariableTypeCategory::nonscalar; + // "pointers" - in the sense of raw address point-of-view, are considered + // scalars. However + if (mlir::isa(eleTy)) + return mlir::acc::VariableTypeCategory::scalar; + + // Without further checking, this type cannot be categorized. + return mlir::acc::VariableTypeCategory::uncategorized; +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCPointerLikeModel::getPointeeTypeCategory( + mlir::Type pointer, mlir::TypedValue varPtr, + mlir::Type varType) const { + return categorizePointee(pointer, varPtr, varType); +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCPointerLikeModel::getPointeeTypeCategory( + mlir::Type pointer, mlir::TypedValue varPtr, + mlir::Type varType) const { + return categorizePointee(pointer, varPtr, varType); +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCPointerLikeModel::getPointeeTypeCategory( + mlir::Type pointer, mlir::TypedValue varPtr, + mlir::Type varType) const { + return categorizePointee(pointer, varPtr, varType); +} + +template <> +mlir::acc::VariableTypeCategory +OpenACCPointerLikeModel::getPointeeTypeCategory( + mlir::Type pointer, mlir::TypedValue varPtr, + mlir::Type varType) const { + return categorizePointee(pointer, varPtr, varType); +} + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp index 34ea122f6b997..184a264c64325 100644 --- a/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/RegisterOpenACCExtensions.cpp @@ -22,6 +22,15 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { fir::SequenceType::attachInterface>( *ctx); fir::BoxType::attachInterface>(*ctx); + + fir::ReferenceType::attachInterface< + OpenACCPointerLikeModel>(*ctx); + fir::PointerType::attachInterface< + OpenACCPointerLikeModel>(*ctx); + fir::HeapType::attachInterface>( + *ctx); + fir::LLVMPointerType::attachInterface< + OpenACCPointerLikeModel>(*ctx); }); } diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index d55ad9e603ffa..a5cda3b7cb875 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -325,8 +325,8 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, pm.addPass(fir::createFunctionAttr( {framePointerKind, config.NoInfsFPMath, config.NoNaNsFPMath, - config.ApproxFuncFPMath, config.NoSignedZerosFPMath, - config.UnsafeFPMath})); + config.ApproxFuncFPMath, config.NoSignedZerosFPMath, config.UnsafeFPMath, + ""})); fir::addFIRToLLVMPass(pm, config); } diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 49c9a737b2c59..81d09e023db24 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ 
b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -503,11 +503,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp, funcOp->setLoc(builder.getFusedLoc({l}, spAttr)); funcOp.walk([&](fir::cg::XDeclareOp declOp) { - // FIXME: We currently dont handle variables that are not in the entry - // blocks of the fuctions. These may be variable or arguments used in the - // OpenMP target regions. - if (&funcOp.front() == declOp->getBlock()) - handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable); + handleDeclareOp(declOp, fileAttr, spAttr, typeGen, symbolTable); }); // commonBlockMap ensures that we don't create multiple DICommonBlockAttr of // the same name in one function. But it is ok (rather required) to create diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index fd2893998205c..1d6fe6c8d4249 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -487,8 +487,7 @@ void OmpStructureChecker::HasInvalidDistributeNesting( violation = true; } else { // `distribute` region has to be strictly nested inside `teams` - if (!OmpDirectiveSet{llvm::omp::OMPD_teams, llvm::omp::OMPD_target_teams} - .test(GetContextParent().directive)) { + if (!llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { violation = true; } } @@ -518,8 +517,7 @@ void OmpStructureChecker::HasInvalidLoopBinding( if (llvm::omp::Directive::OMPD_loop == beginDir.v && CurrentDirectiveIsNested() && - OmpDirectiveSet{llvm::omp::OMPD_teams, llvm::omp::OMPD_target_teams}.test( - GetContextParent().directive)) { + llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { teamsBindingChecker( "`BIND(TEAMS)` must be specified since the `LOOP` region is " "strictly nested inside a `TEAMS` region."_err_en_US); @@ -726,7 +724,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { HasInvalidDistributeNesting(x); HasInvalidLoopBinding(x); if (CurrentDirectiveIsNested() && - llvm::omp::topTeamsSet.test(GetContextParent().directive)) { + llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { HasInvalidTeamsNesting(beginDir.v, beginDir.source); } if ((beginDir.v == llvm::omp::Directive::OMPD_distribute_parallel_do_simd) || @@ -1169,7 +1167,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPBlockConstruct &x) { } if (CurrentDirectiveIsNested()) { - if (llvm::omp::topTeamsSet.test(GetContextParent().directive)) { + if (llvm::omp::bottomTeamsSet.test(GetContextParent().directive)) { HasInvalidTeamsNesting(beginDir.v, beginDir.source); } if (GetContext().directive == llvm::omp::Directive::OMPD_master) { diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt index 7cc720e2df9af..1f4ee69598918 100644 --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -99,6 +99,15 @@ else() set(NO_LTO_FLAGS "") endif() +# based on AddLLVM.cmake +if (LLVM_COMPILER_IS_GCC_COMPATIBLE) + set(NO_RTTI_FLAGS "-fno-exceptions -fno-rtti") +elseif (MSVC) + set(NO_RTTI_FLAGS "/EHs-c- /GR-") +elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL") + set(NO_RTTI_FLAGS "-qnoeh -qnortti") +endif () + configure_file(config.h.cmake config.h) # include_directories is used here instead of target_include_directories # because add_flang_library creates multiple objects (STATIC/SHARED, OBJECT) @@ -107,6 +116,7 @@ include_directories(AFTER ${CMAKE_CURRENT_BINARY_DIR}) append(${NO_LTO_FLAGS} CMAKE_C_FLAGS) append(${NO_LTO_FLAGS} CMAKE_CXX_FLAGS) +append(${NO_RTTI_FLAGS} CMAKE_CXX_FLAGS) # 
Disable libstdc++/libc++ assertions, even in an LLVM_ENABLE_ASSERTIONS build, # to avoid an unwanted dependency on libstdc++/libc++.so. @@ -189,6 +199,8 @@ include(AddFlangOffloadRuntime) # List of files that are buildable for all devices. set(supported_files + ${FLANG_SOURCE_DIR}/lib/Decimal/binary-to-decimal.cpp + ${FLANG_SOURCE_DIR}/lib/Decimal/decimal-to-binary.cpp ISO_Fortran_binding.cpp allocatable.cpp allocator-registry.cpp diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-2.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-2.fir index ca97c5900281d..24cfaf6ed7ecc 100644 --- a/flang/test/Analysis/AliasAnalysis/alias-analysis-2.fir +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-2.fir @@ -47,13 +47,15 @@ // CHECK-DAG: arg2.load#0 <-> arg2.addr#0: MustAlias // CHECK-DAG: boxp1.addr#0 <-> arg2.addr#0: MayAlias -// TODO: Can the address in a pointer alias the address of a pointer, even when the +// TODO: Can the address in a pointer alias the address of a pointer, when the // pointer has no box. Should this be NoAlias? -// T3: CHECK-DAG: p1.addr#0 <-> p1.tgt#0: MayAlias +// T3 from . +// CHECK-DAG: p1.addr#0 <-> p1.tgt#0: MayAlias // The addresses stored in two different pointers can alias, even if one has no // box. In this program, they happen to be the same address. -// T4: CHECK-DAG: p1.tgt#0 <-> boxp1.addr#0: MayAlias +// T4: +// CHECK-DAG: p1.tgt#0 <-> boxp1.addr#0: MayAlias func.func @_QFPtest(%arg0: !fir.ref {fir.bindc_name = "v1", fir.target}, %arg1: !fir.ref {fir.bindc_name = "v2", fir.target}, %arg2: !fir.ref>> ) attributes {test.ptr = "func"} { diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-target.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-target.fir new file mode 100644 index 0000000000000..8e88b508d56e3 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-target.fir @@ -0,0 +1,82 @@ +// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' 2>&1 | FileCheck %s + +// The test was obtained from +// bbc test.f90 -emit-fir +// module mod +// real, pointer :: p0 +// real, allocatable :: alloc +// real, allocatable, target :: t_alloc +// real, target :: t +// real :: v +// end module +// +// subroutine test(n) +// use mod +// integer :: n +// real r1 +// p0 => t_alloc +// v = alloc +// r1 = p0 +// end subroutine test + +// Checking that aliasing can only happen with an entity with the target attribute +// +// CHECK-DAG: r1#0 <-> t_alloc#0: NoAlias +// CHECK-DAG: r1#0 <-> alloc#0: NoAlias +// CHECK-DAG: t_alloc#0 <-> alloc#0: NoAlias +// CHECK-DAG: r1#0 <-> p0.ptr#0: NoAlias +// CHECK-DAG: t_alloc#0 <-> p0.ptr#0: MayAlias +// CHECK-DAG: alloc#0 <-> p0.ptr#0: NoAlias + +fir.global @_QMmodEalloc : !fir.box> { + %0 = fir.zero_bits !fir.heap + %1 = fir.embox %0 : (!fir.heap) -> !fir.box> + fir.has_value %1 : !fir.box> +} +fir.global @_QMmodEp0 : !fir.box> { + %0 = fir.zero_bits !fir.ptr + %1 = fir.embox %0 : (!fir.ptr) -> !fir.box> + fir.has_value %1 : !fir.box> +} +fir.global @_QMmodEt target : f32 { + %0 = fir.zero_bits f32 + fir.has_value %0 : f32 +} +fir.global @_QMmodEt_alloc target : !fir.box> { + %0 = fir.zero_bits !fir.heap + %1 = fir.embox %0 : (!fir.heap) -> !fir.box> + fir.has_value %1 : !fir.box> +} +fir.global @_QMmodEv : f32 { + %0 = fir.zero_bits f32 + fir.has_value %0 : f32 +} +func.func @_QPtest(%arg0: !fir.ref {fir.bindc_name = "n"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.address_of(@_QMmodEalloc) : !fir.ref>> + %2 = fir.declare %1 {fortran_attrs = 
#fir.var_attrs, uniq_name = "_QMmodEalloc"} : (!fir.ref>>) -> !fir.ref>> + %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtestEn"} : (!fir.ref, !fir.dscope) -> !fir.ref + %4 = fir.address_of(@_QMmodEp0) : !fir.ref>> + %5 = fir.declare %4 {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmodEp0"} : (!fir.ref>>) -> !fir.ref>> + %6 = fir.alloca f32 {bindc_name = "r1", uniq_name = "_QFtestEr1"} + %7 = fir.declare %6 {test.ptr="r1", uniq_name = "_QFtestEr1"} : (!fir.ref) -> !fir.ref + %8 = fir.address_of(@_QMmodEt) : !fir.ref + %9 = fir.declare %8 {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmodEt"} : (!fir.ref) -> !fir.ref + %10 = fir.address_of(@_QMmodEt_alloc) : !fir.ref>> + %11 = fir.declare %10 {fortran_attrs = #fir.var_attrs, uniq_name = "_QMmodEt_alloc"} : (!fir.ref>>) -> !fir.ref>> + %12 = fir.address_of(@_QMmodEv) : !fir.ref + %13 = fir.declare %12 {uniq_name = "_QMmodEv"} : (!fir.ref) -> !fir.ref + %14 = fir.load %11 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr="t_alloc"}: (!fir.box>) -> !fir.heap + %16 = fir.embox %15 : (!fir.heap) -> !fir.box> + fir.store %16 to %5 : !fir.ref>> + %17 = fir.load %2 : !fir.ref>> + %18 = fir.box_addr %17 {test.ptr="alloc"} : (!fir.box>) -> !fir.heap + %19 = fir.load %18 : !fir.heap + fir.store %19 to %13 : !fir.ref + %20 = fir.load %5 : !fir.ref>> + %21 = fir.box_addr %20 {test.ptr="p0.ptr"} : (!fir.box>) -> !fir.ptr + %22 = fir.load %21 : !fir.ptr + fir.store %22 to %7 : !fir.ref + return +} diff --git a/flang/test/Fir/OpenACC/openacc-mappable.fir b/flang/test/Fir/OpenACC/openacc-mappable.fir index 438cb29b991c7..005f002c491a5 100644 --- a/flang/test/Fir/OpenACC/openacc-mappable.fir +++ b/flang/test/Fir/OpenACC/openacc-mappable.fir @@ -19,7 +19,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, // CHECK: Visiting: %{{.*}} = acc.copyin var(%{{.*}} : !fir.box>) -> !fir.box> {name = "arr", structured = false} // CHECK: Mappable: !fir.box> +// CHECK: Type category: array // CHECK: Size: 40 // CHECK: Visiting: %{{.*}} = acc.copyin varPtr(%{{.*}} : !fir.ref>) -> !fir.ref> {name = "arr", structured = false} // CHECK: Mappable: !fir.array<10xf32> +// CHECK: Type category: array // CHECK: Size: 40 diff --git a/flang/test/Fir/OpenACC/openacc-type-categories.f90 b/flang/test/Fir/OpenACC/openacc-type-categories.f90 new file mode 100644 index 0000000000000..c25c38422b755 --- /dev/null +++ b/flang/test/Fir/OpenACC/openacc-type-categories.f90 @@ -0,0 +1,49 @@ +! RUN: bbc -fopenacc -emit-hlfir %s -o - | fir-opt -pass-pipeline='builtin.module(test-fir-openacc-interfaces)' --mlir-disable-threading 2>&1 | FileCheck %s + +program main + real :: scalar + real, allocatable :: scalaralloc + type tt + real :: field + real :: fieldarray(10) + end type tt + type(tt) :: ttvar + real :: arrayconstsize(10) + real, allocatable :: arrayalloc(:) + complex :: complexvar + character*1 :: charvar + + !$acc enter data copyin(scalar, scalaralloc, ttvar, arrayconstsize, arrayalloc) + !$acc enter data copyin(complexvar, charvar, ttvar%field, ttvar%fieldarray, arrayconstsize(1)) +end program + +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "scalar", structured = false} +! CHECK: Pointer-like: !fir.ref +! CHECK: Type category: scalar +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "scalaralloc", structured = false} +! CHECK: Pointer-like: !fir.ref>> +! CHECK: Type category: nonscalar +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "ttvar", structured = false} +! CHECK: Pointer-like: !fir.ref}>> +! CHECK: Type category: composite +! 
CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "arrayconstsize", structured = false} +! CHECK: Pointer-like: !fir.ref> +! CHECK: Type category: array +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "arrayalloc", structured = false} +! CHECK: Pointer-like: !fir.ref>>> +! CHECK: Type category: array +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "complexvar", structured = false} +! CHECK: Pointer-like: !fir.ref> +! CHECK: Type category: scalar +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "charvar", structured = false} +! CHECK: Pointer-like: !fir.ref> +! CHECK: Type category: nonscalar +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "ttvar%field", structured = false} +! CHECK: Pointer-like: !fir.ref +! CHECK: Type category: composite +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "ttvar%fieldarray", structured = false} +! CHECK: Pointer-like: !fir.ref> +! CHECK: Type category: array +! CHECK: Visiting: {{.*}} acc.copyin {{.*}} {name = "arrayconstsize(1)", structured = false} +! CHECK: Pointer-like: !fir.ref> +! CHECK: Type category: array diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index e00fc9d6649c4..ba9b08dad7764 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -19,14 +19,14 @@ func.func @alloca_scalars_nonchar() -> !fir.ref { } // CHECK-LABEL: define ptr @allocmem_scalar_nonchar( -// CHECK: call ptr @malloc(i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)) +// CHECK: call ptr @malloc(i64 4) func.func @allocmem_scalar_nonchar() -> !fir.heap { %1 = fir.allocmem i32 return %1 : !fir.heap } // CHECK-LABEL: define ptr @allocmem_scalars_nonchar( -// CHECK: call ptr @malloc(i64 mul (i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i64 100)) +// CHECK: call ptr @malloc(i64 400) func.func @allocmem_scalars_nonchar() -> !fir.heap { %0 = arith.constant 100 : index %1 = fir.allocmem i32, %0 @@ -48,14 +48,14 @@ func.func @alloca_scalar_char_kind() -> !fir.ref> { } // CHECK-LABEL: define ptr @allocmem_scalar_char( -// CHECK: call ptr @malloc(i64 ptrtoint (ptr getelementptr ([10 x i8], ptr null, i32 1) to i64)) +// CHECK: call ptr @malloc(i64 10) func.func @allocmem_scalar_char() -> !fir.heap> { %1 = fir.allocmem !fir.char<1,10> return %1 : !fir.heap> } // CHECK-LABEL: define ptr @allocmem_scalar_char_kind( -// CHECK: call ptr @malloc(i64 ptrtoint (ptr getelementptr ([10 x i16], ptr null, i32 1) to i64)) +// CHECK: call ptr @malloc(i64 20) func.func @allocmem_scalar_char_kind() -> !fir.heap> { %1 = fir.allocmem !fir.char<2,10> return %1 : !fir.heap> @@ -82,7 +82,7 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref> { // CHECK-LABEL: define ptr @allocmem_scalar_dynchar( // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 -// CHECK: %[[mul2:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[mul1]] +// CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] // CHECK: call ptr @malloc(i64 %[[mul2]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem !fir.char<1,?>(%l : i32) @@ -92,7 +92,7 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { // CHECK-LABEL: define ptr @allocmem_scalar_dynchar_kind( // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 -// CHECK: %[[mul2:.*]] = mul i64 ptrtoint (ptr getelementptr (i16, ptr null, i32 1) to i64), %[[mul1]] +// CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]] // CHECK: call ptr @malloc(i64 %[[mul2]]) func.func @allocmem_scalar_dynchar_kind(%l 
: i32) -> !fir.heap>{ %1 = fir.allocmem !fir.char<2,?>(%l : i32) @@ -131,14 +131,14 @@ func.func @alloca_array_of_dynchar(%l: i32) -> !fir.ref !fir.heap> { %1 = fir.allocmem !fir.array<3x3xi32> return %1 : !fir.heap> } // CHECK-LABEL: define ptr @allocmem_array_of_char( -// CHECK: call ptr @malloc(i64 ptrtoint (ptr getelementptr ([3 x [3 x [10 x i8]]], ptr null, i32 1) to i64)) +// CHECK: call ptr @malloc(i64 90) func.func @allocmem_array_of_char() -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x3x!fir.char<1,10>> return %1 : !fir.heap>> @@ -147,7 +147,7 @@ func.func @allocmem_array_of_char() -> !fir.heap> // CHECK-LABEL: define ptr @allocmem_array_of_dynchar( // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 -// CHECK: %[[mul2:.*]] = mul i64 mul (i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), i64 9), %[[mul1]] +// CHECK: %[[mul2:.*]] = mul i64 9, %[[mul1]] // CHECK: call ptr @malloc(i64 %[[mul2]]) func.func @allocmem_array_of_dynchar(%l: i32) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x3x!fir.char<1,?>>(%l : i32) @@ -175,7 +175,7 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref !fir.heap> { %1 = fir.allocmem !fir.array<3x?xi32>, %e @@ -184,7 +184,7 @@ func.func @allocmem_dynarray_of_nonchar(%e: index) -> !fir.heap !fir.heap> { @@ -213,7 +213,7 @@ func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,10>>, %e @@ -222,7 +222,7 @@ func.func @allocmem_dynarray_of_char(%e : index) -> !fir.heap !fir.heap>> { @@ -255,7 +255,7 @@ func.func @alloca_dynarray_of_dynchar2(%l: i32, %e : index) -> !fir.ref !fir.heap>> { @@ -266,7 +266,7 @@ func.func @allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> !fir.heap !fir.r // CHECK-LABEL: define ptr @allocmem_array_with_holes_nonchar( // CHECK-SAME: i64 %[[e1:.*]], i64 %[[e2:.*]]) -// CHECK: %[[a:.*]] = mul i64 mul (i64 ptrtoint{{.*}} 15), %[[e1]] +// CHECK: %[[a:.*]] = mul i64 240, %[[e1]] // CHECK: %[[b:.*]] = mul i64 %3, %[[e2]] // CHECK: call ptr @malloc(i64 %[[b]]) func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.heap> { @@ -316,7 +316,7 @@ func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.hea // CHECK-LABEL: define ptr @allocmem_array_with_holes_char( // CHECK-SAME: i64 %[[e:.*]]) -// CHECK: %[[mul:.*]] = mul i64 mul (i64 ptrtoint (ptr getelementptr ([3 x [10 x i16]], ptr null, i32 1) to i64), i64 4), %[[e]] +// CHECK: %[[mul:.*]] = mul i64 240, %[[e]] // CHECK: call ptr @malloc(i64 %[[mul]]) func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,10>>, %e @@ -325,7 +325,7 @@ func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap !fir.heap>> { diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir index 69fc77deb57e3..924c1fab8d84b 100644 --- a/flang/test/Fir/arrexp.fir +++ b/flang/test/Fir/arrexp.fir @@ -145,7 +145,7 @@ func.func @f6(%arg0: !fir.box>, %arg1: f32) { // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1 // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]] - // CHECK: %[[SIZE:.*]] = mul i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), %[[EXTENT]] + // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]] // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SIZE]]) %1 = fir.slice %c2, %c10, %c1 : (index, index, index) -> !fir.slice<1> %2 = fir.array_load %arg0 [%1] : (!fir.box>, !fir.slice<1>) -> !fir.array diff --git 
a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir index d4a5157888367..5e931a2e0d9aa 100644 --- a/flang/test/Fir/box.fir +++ b/flang/test/Fir/box.fir @@ -1,7 +1,7 @@ // RUN: tco -o - %s | FileCheck %s // Global box initialization (test must come first because llvm globals are emitted first). -// CHECK-LABEL: @globalx = internal global { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 9, i8 2, i8 0 } +// CHECK-LABEL: @globalx = internal global { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 4, i32 20240719, i8 0, i8 9, i8 2, i8 0 } fir.global internal @globalx : !fir.box> { %c0 = arith.constant 0 : index %0 = fir.convert %c0 : (index) -> !fir.heap @@ -9,7 +9,7 @@ fir.global internal @globalx : !fir.box> { fir.has_value %1 : !fir.box> } -// CHECK-LABEL: @globaly = internal global { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr null, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 2, i8 0,{{.*}}[3 x i64] [i64 1, i64 0, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64)] +// CHECK-LABEL: @globaly = internal global { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr null, i64 4, i32 20240719, i8 1, i8 27, i8 2, i8 0,{{.*}}[3 x i64] [i64 1, i64 0, i64 4] fir.global internal @globaly : !fir.box>> { %c0 = arith.constant 0 : index %0 = fir.convert %c0 : (index) -> !fir.heap> @@ -27,7 +27,7 @@ func.func private @ga(%b : !fir.box>) // CHECK: (ptr captures(none) %[[ARG:.*]]) func.func @f(%a : !fir.ref) { // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } - // CHECK: %[[INS0:.*]] = insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 27, i8 0, i8 0 }, ptr %[[ARG]], 0 + // CHECK: %[[INS0:.*]] = insertvalue {{.*}} { ptr undef, i64 4, i32 20240719, i8 0, i8 27, i8 0, i8 0 }, ptr %[[ARG]], 0 // CHECK: store {{.*}} %[[INS0]], {{.*}} %[[DESC]] %b = fir.embox %a : (!fir.ref) -> !fir.box @@ -44,7 +44,7 @@ func.func @fa(%a : !fir.ref>) { %c1 = arith.constant 1 : index %c100 = arith.constant 100 : index %d = fir.shape %c100 : (index) -> !fir.shape<1> - // CHECK: %[[INS70:.*]] = insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 0, i8 0, {{.*}} }, ptr %{{.*}}, 0 + // CHECK: %[[INS70:.*]] = insertvalue {{.*}} { ptr undef, i64 4, i32 20240719, i8 1, i8 27, i8 0, i8 0, {{.*}} }, ptr %{{.*}}, 0 %b = fir.embox %c(%d) : (!fir.ref>, !fir.shape<1>) -> !fir.box> // CHECK: call void @ga( fir.call @ga(%b) : (!fir.box>) -> () @@ -57,7 +57,7 @@ func.func @fa(%a : !fir.ref>) { // CHECK-SAME: ptr captures(none) %[[res:.*]], ptr captures(none) %[[arg0:.*]], i64 %[[arg1:.*]]) func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box> { // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } - // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] + // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]] // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 @@ -74,8 +74,8 @@ func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box>>, %arg1 : index) -> !fir.box>> { %1 = fir.shape %arg1 : (index) -> !fir.shape<1> // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } - // CHECK: insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr ([5 x 
i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1 - // CHECK: insertvalue {{.*}} %{{.*}}, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), 7, 0, 2 + // CHECK: insertvalue {{.*}} { ptr undef, i64 5, i32 20240719, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1 + // CHECK: insertvalue {{.*}} %{{.*}}, i64 5, 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %2 = fir.embox %arg0(%1) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> // CHECK: store {{.*}}, ptr %[[alloca]] @@ -89,7 +89,7 @@ func.func @b2(%arg0 : !fir.ref>>, %arg1 : index) -> func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %arg2 : index) -> !fir.box>> { %1 = fir.shape %arg2 : (index) -> !fir.shape<1> // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } - // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] + // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]] // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} i64 %[[arg2]], 7, 0, 1 @@ -108,7 +108,7 @@ func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> %c_7 = arith.constant 7 : index %1 = fir.shape %c_7 : (index) -> !fir.shape<1> // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } - // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] + // CHECK: %[[size:.*]] = mul i64 1, %[[arg1]] // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} i64 7, 7, 0, 1 @@ -154,12 +154,12 @@ func.func @box6(%0 : !fir.ref>, %1 : index, %2 : index) // CHECK: %[[sdp2:.*]] = sdiv i64 %[[dp2]], 2 // CHECK: %[[cmp:.*]] = icmp sgt i64 %[[sdp2]], 0 // CHECK: %[[extent:.*]] = select i1 %[[cmp]], i64 %[[sdp2]], i64 0 - // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 2, i8 27, i8 0, i8 0, [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 undef, i64 undef], [3 x i64] undef] }, i64 %[[extent]], 7, 0, 1 - // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 mul (i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i64 200), 7, 0, 2 + // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } { ptr undef, i64 4, i32 20240719, i8 2, i8 27, i8 0, i8 0, [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 undef, i64 undef], [3 x i64] undef] }, i64 %[[extent]], 7, 0, 1 + // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 800, 7, 0, 2 // CHECK: %[[op25:.*]] = add i64 25000, %[[i100p40]] // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 1, 7, 1, 0 // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 4, 7, 1, 1 - // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 mul (i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i64 30000), 7, 1, 2 + // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, i64 120000, 7, 1, 2 // CHECK: %[[op300:.*]] = add i64 300000, %[[op25]] // CHECK: %[[ptr:.*]] = getelementptr float, ptr %[[ARG0]], i64 %[[op300]] // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %{{.*}}, ptr %[[ptr]], 0 diff --git a/flang/test/Fir/convert-to-llvm.fir 
b/flang/test/Fir/convert-to-llvm.fir index 6d7a4a09918e5..8727c0ab08e70 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -2853,3 +2853,35 @@ gpu.module @cuda_device_mod { // CHECK: llvm.func @malloc(i64) -> !llvm.ptr // CHECK: llvm.call @malloc // CHECK: lvm.call @free + +// ----- + +func.func private @somefunc(i32, !fir.ref) + +// CHECK-LABEL: @test_call_arg_attrs_direct +func.func @test_call_arg_attrs_direct(%arg0: i32, %arg1: !fir.ref) { + // CHECK: llvm.call @somefunc(%{{.*}}, %{{.*}}) : (i32, !llvm.ptr {llvm.byval = i64}) -> () + fir.call @somefunc(%arg0, %arg1) : (i32, !fir.ref {llvm.byval = i64}) -> () + return +} + +// CHECK-LABEL: @test_call_arg_attrs_indirect +func.func @test_call_arg_attrs_indirect(%arg0: i16, %arg1: (i16)-> i16) -> i16 { + // CHECK: llvm.call %arg1(%{{.*}}) : !llvm.ptr, (i16 {llvm.noundef, llvm.signext}) -> (i16 {llvm.signext}) + %0 = fir.call %arg1(%arg0) : (i16 {llvm.noundef, llvm.signext}) -> (i16 {llvm.signext}) + return %0 : i16 +} + +// CHECK-LABEL: @test_byval +func.func @test_byval(%arg0: (!fir.ref}>>, f64) -> (), %arg1: !fir.ref}>>, %arg2: f64) { + // llvm.call %{{.*}}(%{{.*}}, %{{.*}}) : !llvm.ptr, (!llvm.ptr {llvm.byval = !llvm.struct<"t", (array<5 x f64>)>}, f64) -> () + fir.call %arg0(%arg1, %arg2) : (!fir.ref}>> {llvm.byval = !fir.type}>}, f64) -> () + return +} + +// CHECK-LABEL: @test_sret +func.func @test_sret(%arg0: (!fir.ref}>>, f64) -> (), %arg1: !fir.ref}>>, %arg2: f64) { + // llvm.call %{{.*}}(%{{.*}}, %{{.*}}) : !llvm.ptr, (!llvm.ptr {llvm.sret = !llvm.struct<"t", (array<5 x f64>)>}, f64) -> () + fir.call %arg0(%arg1, %arg2) : (!fir.ref}>> {llvm.sret = !fir.type}>}, f64) -> () + return +} diff --git a/flang/test/Fir/embox.fir b/flang/test/Fir/embox.fir index 0b8bc3fd4be14..18b5efbc6a0e4 100644 --- a/flang/test/Fir/embox.fir +++ b/flang/test/Fir/embox.fir @@ -13,8 +13,8 @@ func.func @_QPtest_slice() { // CHECK: %[[a2:.*]] = alloca [20 x i32], i64 1, align 4 // CHECK: %[[a3:.*]] = getelementptr [20 x i32], ptr %[[a2]], i64 0, i64 0 // CHECK: %[[a4:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } -// CHECK: { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] -// CHECK: [i64 1, i64 5, i64 mul (i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i64 2)]] }, ptr %[[a3]], 0 +// CHECK: { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] +// CHECK: [i64 1, i64 5, i64 8]] }, ptr %[[a3]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[a4]], ptr %[[a1]], align 8 // CHECK: call void @_QPtest_callee(ptr %[[a1]]) %c20 = arith.constant 20 : index @@ -40,9 +40,8 @@ func.func @_QPtest_dt_slice() { // CHECK: %[[a3:.*]] = alloca [20 x %_QFtest_dt_sliceTt], i64 1, align 8 // CHECK: %[[a4:.*]] = getelementptr [20 x %_QFtest_dt_sliceTt], ptr %[[a3]], i64 0, i64 0, i32 0 // CHECK: %[[a5:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } -// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] -// CHECK-SAME: [i64 1, i64 5, i64 mul -// CHECK-SAME: (i64 ptrtoint (ptr getelementptr (%_QFtest_dt_sliceTt, ptr null, i32 1) to i64), i64 2)]] } +// CHECK-SAME: { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] +// CHECK-SAME: [i64 1, i64 5, i64 16 // CHECK-SAME: , ptr %[[a4]], 0 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] 
} %[[a5]], ptr %[[a1]], align 8 @@ -75,7 +74,7 @@ func.func @emboxSubstring(%arg0: !fir.ref>>) { %1 = fir.slice %c1, %c2, %c1, %c1, %c3, %c1 substr %c1_i64, %c2_i64 : (index, index, index, index, index, index, i64, i64) -> !fir.slice<2> %2 = fir.embox %arg0(%0) [%1] : (!fir.ref>>, !fir.shape<2>, !fir.slice<2>) -> !fir.box>> // CHECK: %[[addr:.*]] = getelementptr [3 x [2 x [4 x i8]]], ptr %[[arg0]], i64 0, i64 0, i64 0, i64 1 - // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 mul (i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), i64 2), i32 20240719, i8 2, i8 40, i8 0, i8 0 + // CHECK: insertvalue {[[descriptorType:.*]]} { ptr undef, i64 2, i32 20240719, i8 2, i8 40, i8 0, i8 0 // CHECK-SAME: [2 x [3 x i64]] [{{\[}}3 x i64] [i64 1, i64 2, i64 4], [3 x i64] [i64 1, i64 3, i64 8]] } // CHECK-SAME: ptr %[[addr]], 0 @@ -98,7 +97,7 @@ func.func @fir_dev_issue_1416(%arg0: !fir.ref>, %low: index // CHECK: %[[offset:.*]] = add i64 %[[mul]], 0 // CHECK: %[[addr:.*]] = getelementptr [40 x float], ptr %0, i64 %[[offset]], i64 0 // CHECK: %[[box:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } -// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 40, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64)]] }, ptr %[[addr]], 0 +// CHECK-SAME: { ptr undef, i64 4, i32 20240719, i8 1, i8 27, i8 0, i8 0, [1 x [3 x i64]] [{{.*}} [i64 1, i64 40, i64 4]] }, ptr %[[addr]], 0 %3 = fir.embox %arg0(%1) [%2] : (!fir.ref>, !fir.shapeshift<2>, !fir.slice<2>) -> !fir.box> fir.call @do_something(%3) : (!fir.box>) -> () return @@ -126,4 +125,4 @@ func.func @_QPtest_allocator2() { return } -// CHECK: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 6 +// CHECK: %{{.*}} = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 4, i32 20240719, i8 1, i8 9, i8 0, i8 6 diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir index 5a30858511f0c..1bfcb3a9f3dc8 100644 --- a/flang/test/Fir/fir-ops.fir +++ b/flang/test/Fir/fir-ops.fir @@ -913,3 +913,23 @@ func.func @test_is_assumed_size(%arg0: !fir.class>, %arg1 : ! 
// CHECK-SAME: %[[B:.*]]: !fir.box>) // CHECK: fir.is_assumed_size %[[A]] : (!fir.class>) -> i1 // CHECK: fir.is_assumed_size %[[B]] : (!fir.box>) -> i1 + +func.func private @somefunc(i32, !fir.ref) + +// CHECK-LABEL: @test_call_arg_attrs_direct +// CHECK-SAME: %[[VAL_0:.*]]: i32, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref) { +func.func @test_call_arg_attrs_direct(%arg0: i32, %arg1: !fir.ref) { + // CHECK: fir.call @somefunc(%[[VAL_0]], %[[VAL_1]]) : (i32, !fir.ref {llvm.byval = i64}) -> () + fir.call @somefunc(%arg0, %arg1) : (i32, !fir.ref {llvm.byval = i64}) -> () + return +} + +// CHECK-LABEL: @test_call_arg_attrs_indirect +// CHECK-SAME: %[[VAL_0:.*]]: i16, +// CHECK-SAME: %[[VAL_1:.*]]: (i16) -> i16) -> i16 { +func.func @test_call_arg_attrs_indirect(%arg0: i16, %arg1: (i16)-> i16) -> i16 { + // CHECK: fir.call %[[VAL_1]](%[[VAL_0]]) : (i16 {llvm.noundef, llvm.signext}) -> (i16 {llvm.signext}) + %0 = fir.call %arg1(%arg0) : (i16 {llvm.noundef, llvm.signext}) -> (i16 {llvm.signext}) + return %0 : i16 +} diff --git a/flang/test/Fir/ignore-missing-type-descriptor.fir b/flang/test/Fir/ignore-missing-type-descriptor.fir index 2958918156410..f9dcb7db77afe 100644 --- a/flang/test/Fir/ignore-missing-type-descriptor.fir +++ b/flang/test/Fir/ignore-missing-type-descriptor.fir @@ -17,6 +17,6 @@ func.func @test_embox(%addr: !fir.ref) { // CHECK-LABEL: define void @test_embox( // CHECK-SAME: ptr captures(none) %[[ADDR:.*]]) // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } -// CHECK-SAME: { ptr undef, i64 ptrtoint (ptr getelementptr (%some_not_mangled_type, ptr null, i32 1) to i64), +// CHECK-SAME: { ptr undef, i64 4, // CHECK-SAME: i32 20240719, i8 0, i8 42, i8 0, i8 1, ptr null, [1 x i64] zeroinitializer }, // CHECK-SAME: ptr %[[ADDR]], 0 diff --git a/flang/test/Fir/polymorphic.fir b/flang/test/Fir/polymorphic.fir index c65841ea54cee..ea1099af6b988 100644 --- a/flang/test/Fir/polymorphic.fir +++ b/flang/test/Fir/polymorphic.fir @@ -64,7 +64,7 @@ func.func @_QMpolymorphic_testPtest_embox() { // CHECK-LABEL: @_QMpolymorphic_testPtest_embox() // CHECK: %[[ALLOCA_DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } { ptr @_QFEy, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, {{.*}}, ptr %[[ALLOCA_DESC]] +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } { ptr @_QFEy, i64 4, i32 20240719, i8 1, i8 9, {{.*}}, ptr %[[ALLOCA_DESC]] // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr @_QFEx, ptr %[[ALLOCA_DESC]], i32 64, i1 false) // Test emboxing of an array element from an unlimited polymorphic array. 
@@ -155,7 +155,7 @@ func.func @_QQmain() { // CHECK-LABEL: define void @_QQmain(){{.*}}{ // CHECK: %[[CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1EXdtXtX2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8 +// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 8, i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1EXdtXtX2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[DESC]], ptr %[[CLASS_NONE]], i32 40, i1 false) // CHECK: call void @_QMmod1Psub1(ptr %[[DESC]]) @@ -193,4 +193,4 @@ func.func @_QQembox_input_type(%arg0 : !fir.ref>> { fir.has_value %2 : !fir.box>> } // CHECK-LABEL: @pointer_char4_init -// CHECK-SAME: { ptr @char4, i64 ptrtoint (ptr getelementptr ([10 x i32], ptr null, i32 1) to i64), i32 20240719, i8 0, i8 44, i8 1, i8 0 } +// CHECK-SAME: { ptr @char4, i64 40, i32 20240719, i8 0, i8 44, i8 1, i8 0 } diff --git a/flang/test/Fir/rebox.fir b/flang/test/Fir/rebox.fir index 91c1e9da5d454..140308be6a814 100644 --- a/flang/test/Fir/rebox.fir +++ b/flang/test/Fir/rebox.fir @@ -25,7 +25,7 @@ func.func @test_rebox_1(%arg0: !fir.box>) { // CHECK: %[[EXTRA_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 6 // CHECK: %[[EXTRA:.*]] = load i8, ptr %[[EXTRA_GEP]] // CHECK: %[[EXTRA_WITH_ADDENDUM_CORRECTION:.*]] = and i8 %[[EXTRA]] - // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 27, i8 0, i8 undef, [1 x [3 x i64]] undef }, i8 %[[EXTRA_WITH_ADDENDUM_CORRECTION]], 6 + // CHECK: %[[OUTBOX0:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 4, i32 20240719, i8 1, i8 27, i8 0, i8 undef, [1 x [3 x i64]] undef }, i8 %[[EXTRA_WITH_ADDENDUM_CORRECTION]], 6 // CHECK: %[[INSTRIDE_0_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 0, i32 2 // CHECK: %[[INSTRIDE_0:.*]] = load i64, ptr %[[INSTRIDE_0_GEP]] // CHECK: %[[INSTRIDE_1_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 7, i32 1, i32 2 @@ -63,7 +63,7 @@ func.func @test_rebox_2(%arg0: !fir.box>>) { // CHECK: %[[OUTBOX:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } // CHECK: %[[LEN_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[INBOX]], i32 0, i32 1 // CHECK: %[[LEN:.*]] = load i64, ptr %[[LEN_GEP]] - // CHECK: %[[SIZE:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[LEN]] + // CHECK: %[[SIZE:.*]] = mul i64 1, %[[LEN]] // CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } undef, i64 %[[SIZE]], 1 %1 = fir.rebox %arg0 [%0] : (!fir.box>>, !fir.slice<2>) -> !fir.box>> diff --git a/flang/test/Fir/target-rewrite-indirect-calls.fir b/flang/test/Fir/target-rewrite-indirect-calls.fir new file mode 100644 index 0000000000000..dbb3d0823520c --- /dev/null +++ b/flang/test/Fir/target-rewrite-indirect-calls.fir @@ -0,0 +1,22 @@ +// Test that ABI attributes are set in indirect calls to 
BIND(C) functions. +// RUN: fir-opt --target-rewrite="target=x86_64-unknown-linux-gnu" %s | FileCheck %s + +func.func @test(%arg0: () -> (), %arg1: !fir.ref}>>, %arg2: f64) { + %0 = fir.load %arg1 : !fir.ref}>> + %1 = fir.convert %arg0 : (() -> ()) -> ((!fir.type}>, f64) -> ()) + fir.call %1(%0, %arg2) proc_attrs : (!fir.type}>, f64) -> () + return +} +// CHECK-LABEL: func.func @test( +// CHECK-SAME: %[[VAL_0:.*]]: () -> (), +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref}>>, +// CHECK-SAME: %[[VAL_2:.*]]: f64) { +// CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref}>> +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_0]] : (() -> ()) -> ((!fir.ref}>>, f64) -> ()) +// CHECK: %[[VAL_5:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.type}> +// CHECK: fir.store %[[VAL_3]] to %[[VAL_6]] : !fir.ref}>> +// CHECK: fir.call %[[VAL_4]](%[[VAL_6]], %[[VAL_2]]) : (!fir.ref}>> {llvm.byval = !fir.type}>}, f64) -> () +// CHECK: llvm.intr.stackrestore %[[VAL_5]] : !llvm.ptr +// CHECK: return +// CHECK: } diff --git a/flang/test/Fir/type-descriptor.fir b/flang/test/Fir/type-descriptor.fir index 3b58a2f68251a..ab48caeb4d199 100644 --- a/flang/test/Fir/type-descriptor.fir +++ b/flang/test/Fir/type-descriptor.fir @@ -13,7 +13,7 @@ fir.global internal @_QFfooEx : !fir.box> { fir.has_value %1 : !fir.box> } // CHECK: @_QFfooEx = internal global { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } -// CHECK-SAME: { ptr null, i64 ptrtoint (ptr getelementptr (%_QFfooTsometype, ptr null, i32 1) to i64), +// CHECK-SAME: { ptr null, i64 80, // CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooEXdtXsometype, [1 x i64] zeroinitializer } !some_pdt_type = !fir.type<_QFfooTsome_pdt_typeK42K43{num:i32,values:!fir.box>>}> @@ -25,5 +25,5 @@ fir.global internal @_QFfooEx2 : !fir.box> { fir.has_value %1 : !fir.box> } // CHECK: @_QFfooEx2 = internal global { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } -// CHECK-SAME: { ptr null, i64 ptrtoint (ptr getelementptr (%_QFfooTsome_pdt_typeK42K43, ptr null, i32 1) to i64), +// CHECK-SAME: { ptr null, i64 80, // CHECK-SAME: i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QFfooEXdtXsome_pdt_typeX42X43, [1 x i64] zeroinitializer } diff --git a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 b/flang/test/Integration/OpenMP/map-types-and-sizes.f90 index 4ff338e79aecc..e0221ef254192 100644 --- a/flang/test/Integration/OpenMP/map-types-and-sizes.f90 +++ b/flang/test/Integration/OpenMP/map-types-and-sizes.f90 @@ -814,6 +814,6 @@ end subroutine mapType_common_block_members !CHECK: %[[OFFLOAD_PTR_ARR:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_ptrs, i32 0, i32 0 !CHECK: store ptr @var_common_, ptr %[[OFFLOAD_PTR_ARR]], align 8 !CHECK: %[[BASE_PTR_ARR_1:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_baseptrs, i32 0, i32 1 -!CHECK: store ptr getelementptr (i8, ptr @var_common_, i64 4), ptr %[[BASE_PTR_ARR_1]], align 8 +!CHECK: store ptr getelementptr inbounds nuw (i8, ptr @var_common_, i64 4), ptr %[[BASE_PTR_ARR_1]], align 8 !CHECK: %[[OFFLOAD_PTR_ARR_1:.*]] = getelementptr inbounds [2 x ptr], ptr %.offload_ptrs, i32 0, i32 1 -!CHECK: store ptr getelementptr (i8, ptr @var_common_, i64 4), ptr %[[OFFLOAD_PTR_ARR_1]], align 8 +!CHECK: store ptr getelementptr inbounds nuw (i8, ptr @var_common_, i64 4), ptr %[[OFFLOAD_PTR_ARR_1]], align 8 diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 index 39d7e2274cff9..1aacfb4c87198 100644 --- 
a/flang/test/Integration/OpenMP/private-global.f90 +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -33,12 +33,12 @@ program bug ! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 ! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[INTERMEDIATE]], ptr %[[PRIV_BOX_ALLOC]], i32 48, i1 false) ! CHECK: store i32 50, ptr %[[FIFTY]], align 4 -! CHECK: %[[FIFTY_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 9, i8 0, i8 0 }, ptr %[[FIFTY]], 0 +! CHECK: %[[FIFTY_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 4, i32 20240719, i8 0, i8 9, i8 0, i8 0 }, ptr %[[FIFTY]], 0 ! CHECK: store { ptr, i64, i32, i8, i8, i8, i8 } %[[FIFTY_BOX_VAL]], ptr %[[BOXED_FIFTY]], align 8 ! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[INTERMEDIATE]], i32 48, i1 false) ! CHECK: call void @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) ! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR]], ptr %[[PRIV_BOX_ALLOC]], i32 48, i1 false) -! CHECK: %[[PRIV_TABLE:.*]] = call ptr @malloc(i64 ptrtoint (ptr getelementptr ([10 x i32], ptr null, i32 1) to i64)) +! CHECK: %[[PRIV_TABLE:.*]] = call ptr @malloc(i64 40) ! ... ! check that we use the private copy of table for table/=50 ! CHECK: omp.par.region3: diff --git a/flang/test/Integration/abi-indirect-call.f90 b/flang/test/Integration/abi-indirect-call.f90 new file mode 100644 index 0000000000000..54a6adfb2c14a --- /dev/null +++ b/flang/test/Integration/abi-indirect-call.f90 @@ -0,0 +1,15 @@ +!REQUIRES: x86-registered-target +!REQUIRES: flang-supports-f128-math +!RUN: %flang_fc1 -emit-llvm -triple x86_64-unknown-linux-gnu %s -o - | FileCheck %s + +! Test ABI of indirect calls is properly implemented in the LLVM IR. + +subroutine foo(func_ptr, z) + interface + complex(16) function func_ptr() + end function + end interface + complex(16) :: z + ! CHECK: call void %{{.*}}(ptr sret({ fp128, fp128 }) align 16 %{{.*}}) + z = func_ptr() +end subroutine diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index d65ca538bf60c..f77aefcc2c314 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -321,8 +321,8 @@ subroutine sub1(i, j, k) ! CHECK: %[[DC_K:.*]] = fir.alloca i32 {bindc_name = "k"} ! CHECK: %[[DC_J:.*]] = fir.alloca i32 {bindc_name = "j"} ! CHECK: %[[DC_I:.*]] = fir.alloca i32 {bindc_name = "i"} -! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref) -> !fir.ref {implicit = true, name = ""} -! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref) -> !fir.ref {implicit = true, name = ""} -! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref) -> !fir.ref {implicit = true, name = ""} +! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref) -> !fir.ref {implicit = true, name = "j"} +! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref) -> !fir.ref {implicit = true, name = "k"} ! 
CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[P_I]] : !fir.ref, @privatization_ref_i32 -> %[[P_J]] : !fir.ref, @privatization_ref_i32 -> %[[P_K]] : !fir.ref) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%c10{{.*}}, %c100{{.*}}, %c200{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) ! CHECK: } attributes {inclusiveUpperbound = array} diff --git a/flang/test/Lower/OpenACC/acc-private-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-private-unwrap-defaultbounds.f90 index febb933e98975..d86f82dae8d03 100644 --- a/flang/test/Lower/OpenACC/acc-private-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-private-unwrap-defaultbounds.f90 @@ -396,7 +396,7 @@ subroutine acc_private_use() ! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFacc_private_useEi"} ! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %0 {uniq_name = "_QFacc_private_useEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: acc.parallel -! CHECK: %[[PRIV_I:.*]] = acc.private varPtr(%[[DECL_I]]#1 : !fir.ref) -> !fir.ref {implicit = true, name = ""} +! CHECK: %[[PRIV_I:.*]] = acc.private varPtr(%[[DECL_I]]#1 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} ! CHECK: %[[DECL_PRIV_I:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFacc_private_useEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: acc.loop {{.*}} private(@privatization_ref_i32 -> %[[PRIV_I]] : !fir.ref) control(%[[IV0:.*]] : i32) = (%c1{{.*}} : i32) to (%c10{{.*}} : i32) step (%c1{{.*}} : i32) ! CHECK: fir.store %[[IV0]] to %[[DECL_PRIV_I]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90 index 99e3b223c8575..c86da8c001b55 100644 --- a/flang/test/Lower/OpenACC/acc-private.f90 +++ b/flang/test/Lower/OpenACC/acc-private.f90 @@ -384,7 +384,7 @@ subroutine acc_private_use() ! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFacc_private_useEi"} ! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %0 {uniq_name = "_QFacc_private_useEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: acc.parallel -! CHECK: %[[PRIV_I:.*]] = acc.private varPtr(%[[DECL_I]]#1 : !fir.ref) -> !fir.ref {implicit = true, name = ""} +! CHECK: %[[PRIV_I:.*]] = acc.private varPtr(%[[DECL_I]]#1 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} ! CHECK: %[[DECL_PRIV_I:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFacc_private_useEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: acc.loop {{.*}} private(@privatization_ref_i32 -> %[[PRIV_I]] : !fir.ref) control(%[[IV0:.*]] : i32) = (%c1{{.*}} : i32) to (%c10{{.*}} : i32) step (%c1{{.*}} : i32) ! CHECK: fir.store %[[IV0]] to %[[DECL_PRIV_I]]#0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 index 44cb08e029aa1..d8403fbbaa510 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause-str.f90 @@ -8,6 +8,14 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 \ ! 
RUN: | FileCheck %s +!CHECK: omp.private {type = private} @{{.*}}test_allocatable_fixed_len_stringEfixed_len_str{{.*}} init { +!CHECK: fir.if {{.*}} { +!CHECK: fir.embox %{{[^[:space:]]+}} : {{.*}} +!CHECK: } else { +!CHECK: fir.embox %{{[^[:space:]]+}} : {{.*}} +!CHECK: } +!CHECK: } + !CHECK: omp.private {type = private} @[[STR_ARR_PRIVATIZER:_QFtest_allocatable_string_arrayEc_private_box_heap_Uxc8xU]] : [[TYPE:.*]] init { !CHECK: ^bb0(%[[ORIG_REF:.*]]: !fir.ref<[[TYPE]]>, %[[C_PVT_BOX_REF:.*]]: !fir.ref<[[TYPE]]>): !CHECK: %{{.*}} = fir.load %[[ORIG_REF]] : !fir.ref>>>> @@ -73,3 +81,11 @@ subroutine test_allocatable_string_array(n) !$omp parallel private(c) !$omp end parallel end subroutine + +subroutine test_allocatable_fixed_len_string() + character(42), allocatable :: fixed_len_str + !$omp parallel do private(fixed_len_str) + do i = 1,10 + end do + !$omp end parallel do +end subroutine diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 index b3e25ae779561..be25169e7d83e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 @@ -36,8 +36,7 @@ end subroutine proc !CHECK: [[MALLOC_BB]]: !CHECK-NOT: omp.par.{{.*}}: -!CHECK: call ptr @malloc -!CHECK-SAME: i64 10 +!CHECK: call ptr @malloc(i64 80) !CHECK: %[[RED_ARR_0:.*]] = getelementptr inbounds [2 x ptr], ptr %red.array, i64 0, i64 0 !CHECK: store ptr %[[F_priv]], ptr %[[RED_ARR_0:.*]] diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index db518c541918a..98873454a00ff 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -680,7 +680,7 @@ program test_alloc ! allocatable. ! LLVM-LABEL: define void @_QMpolyPtest_deallocate() -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] +! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 8, i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] ! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false) ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) diff --git a/flang/test/Lower/common-block.f90 b/flang/test/Lower/common-block.f90 index b5c1389df45d3..5e32c119bcf21 100644 --- a/flang/test/Lower/common-block.f90 +++ b/flang/test/Lower/common-block.f90 @@ -14,7 +14,7 @@ subroutine s0 common // a0, b0 - ! CHECK: call void @_QPs(ptr @__BLNK__, ptr getelementptr (i8, ptr @__BLNK__, i64 4)) + ! CHECK: call void @_QPs(ptr @__BLNK__, ptr getelementptr inbounds nuw (i8, ptr @__BLNK__, i64 4)) call s(a0, b0) end subroutine s0 @@ -23,7 +23,7 @@ subroutine s1 common /x/ a1, b1 data a1 /1.0/, b1 /2.0/ - ! CHECK: call void @_QPs(ptr @x_, ptr getelementptr (i8, ptr @x_, i64 4)) + ! CHECK: call void @_QPs(ptr @x_, ptr getelementptr inbounds nuw (i8, ptr @x_, i64 4)) call s(a1, b1) end subroutine s1 @@ -31,7 +31,7 @@ end subroutine s1 subroutine s2 common /y/ a2, b2, c2 - ! 
CHECK: call void @_QPs(ptr @y_, ptr getelementptr (i8, ptr @y_, i64 4)) + ! CHECK: call void @_QPs(ptr @y_, ptr getelementptr inbounds nuw (i8, ptr @y_, i64 4)) call s(a2, b2) end subroutine s2 @@ -58,7 +58,7 @@ subroutine s4 use mod_with_common ! CHECK: load i32, ptr @c_in_mod_ print *, i - ! CHECK: load i32, ptr getelementptr (i8, ptr @c_in_mod_, i64 4) + ! CHECK: load i32, ptr getelementptr inbounds nuw (i8, ptr @c_in_mod_, i64 4) print *, j end subroutine s4 diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90 index e97c3f36b0b10..d5f968ba93450 100644 --- a/flang/test/Lower/forall/character-1.f90 +++ b/flang/test/Lower/forall/character-1.f90 @@ -23,7 +23,7 @@ end program test ! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]] ! CHECK: %[[elesize:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 1 ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]] -! CHECK: %[[mul:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[esval]] +! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]] ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]] ! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[mul2]]) ! CHECK: %[[to:.*]] = getelementptr i8, ptr %[[buff]], i64 % diff --git a/flang/test/Lower/real-descriptors.f90 b/flang/test/Lower/real-descriptors.f90 index eb1c4dfae5fd6..c38d4f198dfb7 100644 --- a/flang/test/Lower/real-descriptors.f90 +++ b/flang/test/Lower/real-descriptors.f90 @@ -15,12 +15,12 @@ subroutine test_reals(x2, x3, x4, x8, c2, c3, c4, c8) complex(kind=8) :: c8 read(in,*) x2 - ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (half, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 25, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 2, i32 {{[0-9]*}}, i8 0, i8 25, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z4)", x2 read(in,*) x3 - ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (bfloat, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 26, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 2, i32 {{[0-9]*}}, i8 0, i8 26, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z4)", x3 @@ -33,12 +33,12 @@ subroutine test_reals(x2, x3, x4, x8, c2, c3, c4, c8) print "(z16)", x8 read(in,*) c2 - ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ half, half }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 32, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 4, i32 {{[0-9]*}}, i8 0, i8 32, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z4,' ',z4)", c2 read(in,*) c3 - ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ bfloat, bfloat }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 33, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 4, i32 {{[0-9]*}}, i8 0, i8 33, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! 
CHECK: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z4,' ',z4)", c3 @@ -60,12 +60,12 @@ subroutine test_kind10(x10, c10) complex(kind=kind10) :: c10 read(in,*) x10 - ! CHECK-KIND10: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (x86_fp80, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 29, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK-KIND10: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 16, i32 {{[0-9]*}}, i8 0, i8 29, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK-KIND10: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z20)", x10 read(in,*) c10 - ! CHECK-KIND10: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ x86_fp80, x86_fp80 }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 36, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK-KIND10: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 32, i32 {{[0-9]*}}, i8 0, i8 36, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK-KIND10: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z20,' ',z20)", c10 end subroutine @@ -78,12 +78,12 @@ subroutine test_kind16(x16, c16) complex(kind=kind16) :: c16 read(in,*) x16 - ! CHECK-KIND16: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr (fp128, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 31, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK-KIND16: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 16, i32 {{[0-9]*}}, i8 0, i8 31, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK-KIND16: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z32)", x16 read(in,*) c16 - ! CHECK-KIND16: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 ptrtoint (ptr getelementptr ({ fp128, fp128 }, ptr null, i32 1) to i64), i32 {{[0-9]*}}, i8 0, i8 38, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 + ! CHECK-KIND16: insertvalue { ptr, i64, i32, i8, i8, i8, i8 } { ptr undef, i64 32, i32 {{[0-9]*}}, i8 0, i8 38, i8 0, i8 0 }, ptr %{{[0-9]*}}, 0 ! CHECK-KIND16: call i1 @_FortranAioOutputDescriptor(ptr %{{[0-9]*}}, ptr %{{[0-9]*}}) print "(z32,' ',z32)", c16 end subroutine diff --git a/flang/test/Semantics/OpenMP/nested-target.f90 b/flang/test/Semantics/OpenMP/nested-target.f90 index f42b5dde6a08d..6a56a84f4f570 100644 --- a/flang/test/Semantics/OpenMP/nested-target.f90 +++ b/flang/test/Semantics/OpenMP/nested-target.f90 @@ -54,6 +54,7 @@ program main n2 = 10 !$omp target teams map(to:a) !PORTABILITY: If TARGET DATA directive is nested inside TARGET region, the behaviour is unspecified + !ERROR: Only `DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region. !$omp target data map(n1,n2) do i=1, n1 do j=1, n2 @@ -65,6 +66,7 @@ program main !$omp target teams map(to:a) map(from:n1,n2) !PORTABILITY: If TARGET TEAMS DISTRIBUTE PARALLEL DO directive is nested inside TARGET region, the behaviour is unspecified + !ERROR: Only `DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region. 
!$omp target teams distribute parallel do do i=1, n1 do j=1, n2 diff --git a/flang/test/Semantics/OpenMP/nested-teams.f90 b/flang/test/Semantics/OpenMP/nested-teams.f90 index b1a7c92a6906b..974172ee97175 100644 --- a/flang/test/Semantics/OpenMP/nested-teams.f90 +++ b/flang/test/Semantics/OpenMP/nested-teams.f90 @@ -68,6 +68,7 @@ program main !$omp end target !$omp target teams + !ERROR: Only `DISTRIBUTE`, `PARALLEL`, or `LOOP` regions are allowed to be strictly nested inside `TEAMS` region. !ERROR: TEAMS region can only be strictly nested within the implicit parallel region or TARGET region !$omp teams a = 3.14 diff --git a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp index 5c14809a265e1..90aabd7d40d44 100644 --- a/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp +++ b/flang/test/lib/OpenACC/TestOpenACCInterfaces.cpp @@ -32,15 +32,43 @@ struct TestFIROpenACCInterfaces mlir::OpBuilder builder(mod); getOperation().walk([&](Operation *op) { if (isa(op)) { - Type typeOfVar = acc::getVar(op).getType(); - llvm::errs() << "Visiting: " << *op << "\n"; + Value var = acc::getVar(op); + Type typeOfVar = var.getType(); + + // Attempt to determine if the variable is mappable-like or if + // the pointee itself is mappable-like. For example, if the variable is + // of type !fir.ref>, we want to print both the details about + // the !fir.ref since it is pointer-like, and about !fir.box since it + // is mappable. auto mappableTy = dyn_cast_if_present(typeOfVar); if (!mappableTy) { mappableTy = dyn_cast_if_present(acc::getVarType(op)); } + + llvm::errs() << "Visiting: " << *op << "\n"; + llvm::errs() << "\tVar: " << var << "\n"; + + if (auto ptrTy = dyn_cast_if_present(typeOfVar)) { + llvm::errs() << "\tPointer-like: " << typeOfVar << "\n"; + // If the pointee is not mappable, print details about it. Otherwise, + // we defer to the mappable printing below to print those details. 
+ if (!mappableTy) { + acc::VariableTypeCategory typeCategory = + ptrTy.getPointeeTypeCategory( + cast>(var), + acc::getVarType(op)); + llvm::errs() << "\t\tType category: " << typeCategory << "\n"; + } + } + if (mappableTy) { llvm::errs() << "\tMappable: " << mappableTy << "\n"; + + acc::VariableTypeCategory typeCategory = + mappableTy.getTypeCategory(var); + llvm::errs() << "\t\tType category: " << typeCategory << "\n"; + if (datalayout.has_value()) { auto size = mappableTy.getSizeInBytes( acc::getVar(op), acc::getBounds(op), datalayout.value()); @@ -61,10 +89,6 @@ struct TestFIROpenACCInterfaces llvm::errs() << "\t\tBound[" << idx << "]: " << bound << "\n"; } } - } else { - assert(acc::isPointerLikeType(typeOfVar) && - "expected to be pointer-like"); - llvm::errs() << "\tPointer-like: " << typeOfVar << "\n"; } } }); diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 694cd7b1993ca..351f727389e3a 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -469,6 +469,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 667ab40dca999..39c70a22a21e0 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -464,6 +464,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 6e67ea559d57b..a9ba0c257755b 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -749,6 +749,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) # TODO: https://github.com/llvm/llvm-project/issues/115778 libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 0a942516db6c3..76e593296a4ea 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -875,6 +875,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + 
libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/docs/headers/math/stdfix.rst b/libc/docs/headers/math/stdfix.rst index 58052f000995c..4507f2b608bf1 100644 --- a/libc/docs/headers/math/stdfix.rst +++ b/libc/docs/headers/math/stdfix.rst @@ -73,7 +73,7 @@ The following functions are included in the ISO/IEC TR 18037:2008 standard. +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | \*bits | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ -| countls | | | | | | | | | | | | | +| countls | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | divi | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index d3f8a4e63268a..a0fa506c01ab8 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_COMMON_H #define LLVM_LIBC_COMMON_H +#define __LLVM_LIBC__ 1 + #ifdef __cplusplus #undef __BEGIN_C_DECLS diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index ea892a87dbe7a..8c1f7387f3b4d 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -314,6 +314,8 @@ add_macro_header( endian_macros HDR endian-macros.h + DEPENDS + .stdint_macros ) add_macro_header( @@ -326,6 +328,8 @@ add_macro_header( pthread_macros HDR pthread-macros.h + DEPENDS + .null_macro ) add_macro_header( diff --git a/libc/include/llvm-libc-macros/endian-macros.h b/libc/include/llvm-libc-macros/endian-macros.h index 94e1d60f8ff40..e1e105d50c1c6 100644 --- a/libc/include/llvm-libc-macros/endian-macros.h +++ b/libc/include/llvm-libc-macros/endian-macros.h @@ -9,8 +9,42 @@ #ifndef LLVM_LIBC_MACROS_ENDIAN_MACROS_H #define LLVM_LIBC_MACROS_ENDIAN_MACROS_H +#include "stdint-macros.h" + #define LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ #define BIG_ENDIAN __ORDER_BIG_ENDIAN__ #define BYTE_ORDER __BYTE_ORDER__ +#if BYTE_ORDER == LITTLE_ENDIAN + +#define htobe16(x) __builtin_bswap16((x)) +#define htobe32(x) __builtin_bswap32((x)) +#define htobe64(x) __builtin_bswap64((x)) +#define htole16(x) ((uint16_t)(x)) +#define htole32(x) ((uint32_t)(x)) +#define htole64(x) ((uint64_t)(x)) +#define be16toh(x) __builtin_bswap16((x)) +#define be32toh(x) __builtin_bswap32((x)) +#define be64toh(x) __builtin_bswap64((x)) +#define le16toh(x) ((uint16_t)(x)) +#define le32toh(x) ((uint32_t)(x)) +#define le64toh(x) ((uint64_t)(x)) + +#else + +#define htobe16(x) ((uint16_t)(x)) +#define htobe32(x) ((uint32_t)(x)) +#define htobe64(x) ((uint64_t)(x)) +#define htole16(x) __builtin_bswap16((x)) +#define htole32(x) 
__builtin_bswap32((x)) +#define htole64(x) __builtin_bswap64((x)) +#define be16toh(x) ((uint16_t)(x)) +#define be32toh(x) ((uint32_t)(x)) +#define be64toh(x) ((uint64_t)(x)) +#define le16toh(x) __builtin_bswap16((x)) +#define le32toh(x) __builtin_bswap32((x)) +#define le64toh(x) __builtin_bswap64((x)) + +#endif + #endif // LLVM_LIBC_MACROS_ENDIAN_MACROS_H diff --git a/libc/include/llvm-libc-macros/features-macros.h b/libc/include/llvm-libc-macros/features-macros.h index 5bc87a68fc0ba..f87ae4ad12408 100644 --- a/libc/include/llvm-libc-macros/features-macros.h +++ b/libc/include/llvm-libc-macros/features-macros.h @@ -9,6 +9,4 @@ #ifndef LLVM_LIBC_MACROS_FEATURES_MACROS_H #define LLVM_LIBC_MACROS_FEATURES_MACROS_H -#define __LLVM_LIBC__ 1 - #endif // LLVM_LIBC_MACROS_FEATURES_MACROS_H diff --git a/libc/include/llvm-libc-macros/pthread-macros.h b/libc/include/llvm-libc-macros/pthread-macros.h index 8a144dbd2e611..fcc6ef925e3f4 100644 --- a/libc/include/llvm-libc-macros/pthread-macros.h +++ b/libc/include/llvm-libc-macros/pthread-macros.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_MACROS_PTHREAD_MACRO_H #define LLVM_LIBC_MACROS_PTHREAD_MACRO_H +#include "null-macro.h" + #define PTHREAD_CREATE_JOINABLE 0 #define PTHREAD_CREATE_DETACHED 1 @@ -25,8 +27,34 @@ #define PTHREAD_PROCESS_PRIVATE 0 #define PTHREAD_PROCESS_SHARED 1 -#define PTHREAD_MUTEX_INITIALIZER {0} -#define PTHREAD_RWLOCK_INITIALIZER {0} +#ifdef __linux__ +#define PTHREAD_MUTEX_INITIALIZER \ + { \ + /* .__timed = */ 0, /* .__recursive = */ 0, \ + /* .__robust = */ 0, /* .__owner = */ NULL, \ + /* .__lock_count = */ 0, /* .__futex_word = */ {0}, \ + } +#else +#define PTHREAD_MUTEX_INITIALIZER \ + { \ + /* .__timed = */ 0, /* .__recursive = */ 0, \ + /* .__robust = */ 0, /* .__owner = */ NULL, \ + /* .__lock_count = */ 0, \ + } +#endif + +#define PTHREAD_RWLOCK_INITIALIZER \ + { \ + /* .__is_pshared = */ 0, \ + /* .__preference = */ 0, \ + /* .__state = */ 0, \ + /* .__write_tid = */ 0, \ + /* .__wait_queue_mutex = */ {0}, \ + /* .__pending_readers = */ {0}, \ + /* .__pending_writers = */ {0}, \ + /* .__reader_serialization = */ {0}, \ + /* .__writer_serialization = */ {0}, \ + } // glibc extensions #define PTHREAD_STACK_MIN (1 << 14) // 16KB diff --git a/libc/include/llvm-libc-types/struct_tm.h b/libc/include/llvm-libc-types/struct_tm.h index 9fef7c5718ea4..2ec74ecac0293 100644 --- a/libc/include/llvm-libc-types/struct_tm.h +++ b/libc/include/llvm-libc-types/struct_tm.h @@ -19,6 +19,7 @@ struct tm { int tm_wday; // days since Sunday int tm_yday; // days since January int tm_isdst; // Daylight Saving Time flag + // TODO: add tm_gmtoff and tm_zone? 
(posix extensions) }; #endif // LLVM_LIBC_TYPES_STRUCT_TM_H diff --git a/libc/include/stdfix.yaml b/libc/include/stdfix.yaml index 9663ac0c7df4d..0abf2f3a9b3b6 100644 --- a/libc/include/stdfix.yaml +++ b/libc/include/stdfix.yaml @@ -306,3 +306,87 @@ functions: arguments: - type: unsigned int guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlshr + standards: + - stdc_ext + return_type: int + arguments: + - type: short fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsr + standards: + - stdc_ext + return_type: int + arguments: + - type: fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlslr + standards: + - stdc_ext + return_type: int + arguments: + - type: long fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlshk + standards: + - stdc_ext + return_type: int + arguments: + - type: short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsk + standards: + - stdc_ext + return_type: int + arguments: + - type: accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlslk + standards: + - stdc_ext + return_type: int + arguments: + - type: long accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsuhr + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned short fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsur + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsulr + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned long fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsuhk + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsuk + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsulk + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned long accum + guard: LIBC_COMPILER_HAS_FIXED_POINT diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h index adcd0472747d0..82b9eb5128262 100644 --- a/libc/src/__support/CPP/bit.h +++ b/libc/src/__support/CPP/bit.h @@ -232,7 +232,7 @@ rotl(T value, int rotate) { return value; if (rotate < 0) return cpp::rotr(value, -rotate); - return (value << rotate) | (value >> (N - rotate)); + return static_cast((value << rotate) | (value >> (N - rotate))); } template @@ -244,7 +244,7 @@ rotr(T value, int rotate) { return value; if (rotate < 0) return cpp::rotl(value, -rotate); - return (value >> rotate) | (value << (N - rotate)); + return static_cast((value >> rotate) | (value << (N - rotate))); } // TODO: Do we need this function at all? 
How is it different from diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index db3c2c8a3d7a6..b24ffd4aa456f 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -18,43 +18,52 @@ namespace LIBC_NAMESPACE_DECL { namespace fputil { -#define DEFAULT_DOUBLE_SPLIT 27 +template struct DefaultSplit; +template <> struct DefaultSplit { + static constexpr size_t VALUE = 12; +}; +template <> struct DefaultSplit { + static constexpr size_t VALUE = 27; +}; -using DoubleDouble = LIBC_NAMESPACE::NumberPair; +using DoubleDouble = NumberPair; +using FloatFloat = NumberPair; // The output of Dekker's FastTwoSum algorithm is correct, i.e.: // r.hi + r.lo = a + b exactly // and |r.lo| < eps(r.lo) // Assumption: |a| >= |b|, or a = 0. -template -LIBC_INLINE constexpr DoubleDouble exact_add(double a, double b) { - DoubleDouble r{0.0, 0.0}; +template +LIBC_INLINE constexpr NumberPair exact_add(T a, T b) { + NumberPair r{0.0, 0.0}; if constexpr (FAST2SUM) { r.hi = a + b; - double t = r.hi - a; + T t = r.hi - a; r.lo = b - t; } else { r.hi = a + b; - double t1 = r.hi - a; - double t2 = r.hi - t1; - double t3 = b - t1; - double t4 = a - t2; + T t1 = r.hi - a; + T t2 = r.hi - t1; + T t3 = b - t1; + T t4 = a - t2; r.lo = t3 + t4; } return r; } // Assumption: |a.hi| >= |b.hi| -LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, - const DoubleDouble &b) { - DoubleDouble r = exact_add(a.hi, b.hi); - double lo = a.lo + b.lo; +template +LIBC_INLINE constexpr NumberPair add(const NumberPair &a, + const NumberPair &b) { + NumberPair r = exact_add(a.hi, b.hi); + T lo = a.lo + b.lo; return exact_add(r.hi, r.lo + lo); } // Assumption: |a.hi| >= |b| -LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) { - DoubleDouble r = exact_add(a.hi, b); +template +LIBC_INLINE constexpr NumberPair add(const NumberPair &a, T b) { + NumberPair r = exact_add(a.hi, b); return exact_add(r.hi, r.lo + a.lo); } @@ -63,12 +72,12 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) { // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed // Roundings," https://inria.hal.science/hal-04480440. // Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. -template -LIBC_INLINE constexpr DoubleDouble split(double a) { - DoubleDouble r{0.0, 0.0}; +template ::VALUE> +LIBC_INLINE constexpr NumberPair split(T a) { + NumberPair r{0.0, 0.0}; // CN = 2^N. - constexpr double CN = static_cast(1 << N); - constexpr double C = CN + 1.0; + constexpr T CN = static_cast(1 << N); + constexpr T C = CN + 1.0; double t1 = C * a; double t2 = a - t1; r.hi = t1 + t2; @@ -77,16 +86,15 @@ LIBC_INLINE constexpr DoubleDouble split(double a) { } // Helper for non-fma exact mult where the first number is already split. 
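// A minimal, self-contained sketch of the Fast2Sum step that exact_add in the
// hunk above implements (the function name fast_two_sum and the tiny demo
// values below are illustrative choices, not part of the patch). Assuming
// round-to-nearest doubles and |a| >= |b| (or a == 0), the pair (hi, lo)
// satisfies hi + lo == a + b exactly, with lo capturing the rounding error of
// the naive sum.
#include <cassert>
#include <cstdio>

static void fast_two_sum(double a, double b, double &hi, double &lo) {
  hi = a + b;        // rounded sum
  double t = hi - a; // the part of b that survived the rounding
  lo = b - t;        // the part of b that was rounded away
}

int main() {
  double hi = 0.0, lo = 0.0;
  fast_two_sum(1.0, 1.0e-20, hi, lo);
  // hi alone drops the small addend entirely, but hi + lo still represents
  // 1.0 + 1.0e-20 exactly across the two doubles.
  assert(hi == 1.0);
  assert(lo == 1.0e-20);
  std::printf("hi = %.17g  lo = %.17g\n", hi, lo);
  return 0;
}
// The branch-free variant in the hunk above (exact_add with FAST2SUM == false)
// drops the |a| >= |b| precondition at the cost of a few extra operations.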
-template -LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a, - double b) { - DoubleDouble bs = split(b); - DoubleDouble r{0.0, 0.0}; +template ::VALUE> +LIBC_INLINE NumberPair exact_mult(const NumberPair &as, T a, T b) { + NumberPair bs = split(b); + NumberPair r{0.0, 0.0}; r.hi = a * b; - double t1 = as.hi * bs.hi - r.hi; - double t2 = as.hi * bs.lo + t1; - double t3 = as.lo * bs.hi + t2; + T t1 = as.hi * bs.hi - r.hi; + T t2 = as.hi * bs.lo + t1; + T t3 = as.lo * bs.hi + t2; r.lo = as.lo * bs.lo + t3; return r; @@ -99,18 +107,18 @@ LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a, // Using Theorem 1 in the paper above, without FMA instruction, if we restrict // the generated constants to precision <= 51, and splitting it by 2^28 + 1, // then a * b = r.hi + r.lo is exact for all rounding modes. -template -LIBC_INLINE DoubleDouble exact_mult(double a, double b) { - DoubleDouble r{0.0, 0.0}; +template ::VALUE> +LIBC_INLINE NumberPair exact_mult(T a, T b) { + NumberPair r{0.0, 0.0}; #ifdef LIBC_TARGET_CPU_HAS_FMA r.hi = a * b; r.lo = fputil::multiply_add(a, b, -r.hi); #else // Dekker's Product. - DoubleDouble as = split(a); + NumberPair as = split(a); - r = exact_mult(as, a, b); + r = exact_mult(as, a, b); #endif // LIBC_TARGET_CPU_HAS_FMA return r; @@ -125,7 +133,7 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { template LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, const DoubleDouble &b) { - DoubleDouble r = exact_mult(a.hi, b.hi); + DoubleDouble r = exact_mult(a.hi, b.hi); double t1 = multiply_add(a.hi, b.lo, r.lo); double t2 = multiply_add(a.lo, b.hi, t1); r.lo = t2; @@ -157,19 +165,20 @@ LIBC_INLINE DoubleDouble multiply_add(const DoubleDouble &a, // rl = q * (ah - bh * rh) + q * (al - bl * rh) // as accurate as possible, then the error is bounded by: // |(ah + al) / (bh + bl) - (rh + rl)| < O(bl/bh) * (2^-52 + al/ah + bl/bh) -LIBC_INLINE DoubleDouble div(const DoubleDouble &a, const DoubleDouble &b) { - DoubleDouble r; - double q = 1.0 / b.hi; +template +LIBC_INLINE NumberPair div(const NumberPair &a, const NumberPair &b) { + NumberPair r; + T q = T(1) / b.hi; r.hi = a.hi * q; #ifdef LIBC_TARGET_CPU_HAS_FMA - double e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi); - double e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo); + T e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi); + T e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo); #else - DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); - DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); - double e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo; - double e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo; + NumberPair b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); + NumberPair b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); + T e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo; + T e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo; #endif // LIBC_TARGET_CPU_HAS_FMA r.lo = q * (e_hi + e_lo); diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt index 3b744081765e4..b415e2c00c488 100644 --- a/libc/src/__support/fixed_point/CMakeLists.txt +++ b/libc/src/__support/fixed_point/CMakeLists.txt @@ -19,6 +19,7 @@ add_header_library( libc.src.__support.macros.optimization libc.src.__support.CPP.type_traits libc.src.__support.CPP.bit + libc.src.__support.CPP.limits libc.src.__support.math_extras ) diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index 225ea417760a0..7509419da0c43 100644 --- 
a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -11,9 +11,10 @@ #include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" +#include "src/__support/CPP/limits.h" // numeric_limits #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE -#include "src/__support/macros/config.h" +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/math_extras.h" @@ -50,6 +51,12 @@ template struct FXBits { static constexpr StorageType SIGN_MASK = (fx_rep::SIGN_LEN == 0 ? 0 : StorageType(1) << SIGN_OFFSET); + // mask for + static constexpr StorageType VALUE_MASK = INTEGRAL_MASK | FRACTION_MASK; + + // mask for + static constexpr StorageType TOTAL_MASK = SIGN_MASK | VALUE_MASK; + public: LIBC_INLINE constexpr FXBits() = default; @@ -74,6 +81,12 @@ template struct FXBits { return (value & INTEGRAL_MASK) >> INTEGRAL_OFFSET; } + // returns complete bitstring representation the fixed point number + // the bitstring is of the form: padding | sign | integral | fraction + LIBC_INLINE constexpr StorageType get_bits() { + return (value & TOTAL_MASK) >> FRACTION_OFFSET; + } + // TODO: replace bool with Sign LIBC_INLINE constexpr bool get_sign() { return static_cast((value & SIGN_MASK) >> SIGN_OFFSET); @@ -163,6 +176,24 @@ template LIBC_INLINE constexpr T round(T x, int n) { return bit_and((x + round_bit), rounding_mask); } +// count leading sign bits +// TODO: support fixed_point_padding +template +LIBC_INLINE constexpr cpp::enable_if_t, int> +countls(T f) { + using FXRep = FXRep; + using BitType = typename FXRep::StorageType; + using FXBits = FXBits; + + if constexpr (FXRep::SIGN_LEN > 0) { + if (f < 0) + f = bit_not(f); + } + + BitType value_bits = FXBits(f).get_bits(); + return cpp::countl_zero(value_bits) - FXRep::SIGN_LEN; +} + } // namespace fixed_point } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h index 186938947694e..7227fffa683a8 100644 --- a/libc/src/__support/fixed_point/fx_rep.h +++ b/libc/src/__support/fixed_point/fx_rep.h @@ -43,8 +43,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = 0; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SFRACT_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return SFRACT_MIN; } LIBC_INLINE static constexpr Type MAX() { return SFRACT_MAX; } @@ -63,8 +63,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 0; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = 0; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = USFRACT_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return USFRACT_MIN; } LIBC_INLINE static constexpr Type MAX() { return USFRACT_MAX; } @@ -83,8 +83,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static 
constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = 0; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = FRACT_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return FRACT_MIN; } LIBC_INLINE static constexpr Type MAX() { return FRACT_MAX; } @@ -103,8 +103,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 0; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = 0; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = UFRACT_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return UFRACT_MIN; } LIBC_INLINE static constexpr Type MAX() { return UFRACT_MAX; } @@ -123,8 +123,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = 0; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = LFRACT_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return LFRACT_MIN; } LIBC_INLINE static constexpr Type MAX() { return LFRACT_MAX; } @@ -143,8 +143,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 0; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = 0; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = ULFRACT_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return ULFRACT_MIN; } LIBC_INLINE static constexpr Type MAX() { return ULFRACT_MAX; } @@ -163,8 +163,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = SACCUM_IBIT; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SACCUM_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return SACCUM_MIN; } LIBC_INLINE static constexpr Type MAX() { return SACCUM_MAX; } @@ -183,8 +183,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 0; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = USACCUM_IBIT; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = USACCUM_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return USACCUM_MIN; } LIBC_INLINE static constexpr Type MAX() { return USACCUM_MAX; } @@ -203,8 +203,8 @@ template <> struct FXRep { 
LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = ACCUM_IBIT; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = ACCUM_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return ACCUM_MIN; } LIBC_INLINE static constexpr Type MAX() { return ACCUM_MAX; } @@ -223,8 +223,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 0; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = UACCUM_IBIT; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = UACCUM_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return UACCUM_MIN; } LIBC_INLINE static constexpr Type MAX() { return UACCUM_MAX; } @@ -243,8 +243,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = LACCUM_IBIT; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = LACCUM_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return LACCUM_MIN; } LIBC_INLINE static constexpr Type MAX() { return LACCUM_MAX; } @@ -263,8 +263,8 @@ template <> struct FXRep { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 0; LIBC_INLINE_VAR static constexpr int INTEGRAL_LEN = ULACCUM_IBIT; LIBC_INLINE_VAR static constexpr int FRACTION_LEN = ULACCUM_FBIT; - LIBC_INLINE_VAR static constexpr int TOTAL_LEN = - SIGN_LEN + INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int VALUE_LEN = INTEGRAL_LEN + FRACTION_LEN; + LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + VALUE_LEN; LIBC_INLINE static constexpr Type MIN() { return ULACCUM_MIN; } LIBC_INLINE static constexpr Type MAX() { return ULACCUM_MAX; } diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h index a2634950d431b..253843e5e37aa 100644 --- a/libc/src/__support/macros/optimization.h +++ b/libc/src/__support/macros/optimization.h @@ -45,6 +45,7 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) { #define LIBC_MATH_FAST \ (LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES | \ LIBC_MATH_NO_ERRNO | LIBC_MATH_NO_EXCEPT) +#define LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT 0x10 #ifndef LIBC_MATH #define LIBC_MATH 0 @@ -58,4 +59,8 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) { #define LIBC_MATH_HAS_SMALL_TABLES #endif +#if (LIBC_MATH & LIBC_MATH_INTERMEDIATE_COMP_IN_FLOAT) +#define LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT +#endif + #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 9faf46d491426..2bda741b453f5 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4052,8 +4052,10 @@ add_entrypoint_object( atan2f.cpp HDRS ../atan2f.h + atan2f_float.h DEPENDS .inv_trigf_utils + 
libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp index db7639396cdd7..5ac2b29438ea9 100644 --- a/libc/src/math/generic/atan2f.cpp +++ b/libc/src/math/generic/atan2f.cpp @@ -17,6 +17,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) && \ + defined(LIBC_MATH_HAS_INTERMEDIATE_COMP_IN_FLOAT) + +// We use float-float implementation to reduce size. +#include "src/math/generic/atan2f_float.h" + +#else + namespace LIBC_NAMESPACE_DECL { namespace { @@ -324,3 +332,5 @@ LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) { } } // namespace LIBC_NAMESPACE_DECL + +#endif diff --git a/libc/src/math/generic/atan2f_float.h b/libc/src/math/generic/atan2f_float.h new file mode 100644 index 0000000000000..1fd853d735950 --- /dev/null +++ b/libc/src/math/generic/atan2f_float.h @@ -0,0 +1,237 @@ +//===-- Single-precision atan2f function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/math/atan2f.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace { + +using FloatFloat = fputil::FloatFloat; + +// atan(i/64) with i = 0..16, generated by Sollya with: +// > for i from 0 to 16 do { +// a = round(atan(i/16), SG, RN); +// b = round(atan(i/16) - a, SG, RN); +// print("{", b, ",", a, "},"); +// }; +constexpr FloatFloat ATAN_I[17] = { + {0.0f, 0.0f}, + {-0x1.1a6042p-30f, 0x1.ff55bcp-5f}, + {-0x1.54f424p-30f, 0x1.fd5baap-4f}, + {0x1.79cb6p-28f, 0x1.7b97b4p-3f}, + {-0x1.b4dfc8p-29f, 0x1.f5b76p-3f}, + {-0x1.1f0286p-27f, 0x1.362774p-2f}, + {0x1.e4defp-30f, 0x1.6f6194p-2f}, + {0x1.e611fep-29f, 0x1.a64eecp-2f}, + {0x1.586ed4p-28f, 0x1.dac67p-2f}, + {-0x1.6499e6p-26f, 0x1.0657eap-1f}, + {0x1.7bdfd6p-26f, 0x1.1e00bap-1f}, + {-0x1.98e422p-28f, 0x1.345f02p-1f}, + {0x1.934f7p-28f, 0x1.4978fap-1f}, + {0x1.c5a6c6p-27f, 0x1.5d5898p-1f}, + {0x1.5e118cp-27f, 0x1.700a7cp-1f}, + {-0x1.1d4eb6p-26f, 0x1.819d0cp-1f}, + {-0x1.777a5cp-26f, 0x1.921fb6p-1f}, +}; + +// Approximate atan(x) for |x| <= 2^-5. +// Using degree-3 Taylor polynomial: +// P = x - x^3/3 +// Then the absolute error is bounded by: +// |atan(x) - P(x)| < |x|^5/5 < 2^(-5*5) / 5 < 2^-27. +// And the relative error is bounded by: +// |(atan(x) - P(x))/atan(x)| < |x|^4 / 4 < 2^-22. 
+// For x = x_hi + x_lo, fully expand the polynomial and drop any terms less than +// ulp(x_hi^3 / 3) gives us: +// P(x) ~ x_hi - x_hi^3/3 + x_lo * (1 - x_hi^2) +FloatFloat atan_eval(const FloatFloat &x) { + FloatFloat p; + p.hi = x.hi; + float x_hi_sq = x.hi * x.hi; + // c0 ~ - x_hi^2 / 3 + float c0 = -0x1.555556p-2f * x_hi_sq; + // c1 ~ x_lo * (1 - x_hi^2) + float c1 = fputil::multiply_add(x_hi_sq, -x.lo, x.lo); + // p.lo ~ - x_hi^3 / 3 + x_lo * (1 - x_hi*2) + p.lo = fputil::multiply_add(x.hi, c0, c1); + return p; +} + +} // anonymous namespace + +// There are several range reduction steps we can take for atan2(y, x) as +// follow: + +// * Range reduction 1: signness +// atan2(y, x) will return a number between -PI and PI representing the angle +// forming by the 0x axis and the vector (x, y) on the 0xy-plane. +// In particular, we have that: +// atan2(y, x) = atan( y/x ) if x >= 0 and y >= 0 (I-quadrant) +// = pi + atan( y/x ) if x < 0 and y >= 0 (II-quadrant) +// = -pi + atan( y/x ) if x < 0 and y < 0 (III-quadrant) +// = atan( y/x ) if x >= 0 and y < 0 (IV-quadrant) +// Since atan function is odd, we can use the formula: +// atan(-u) = -atan(u) +// to adjust the above conditions a bit further: +// atan2(y, x) = atan( |y|/|x| ) if x >= 0 and y >= 0 (I-quadrant) +// = pi - atan( |y|/|x| ) if x < 0 and y >= 0 (II-quadrant) +// = -pi + atan( |y|/|x| ) if x < 0 and y < 0 (III-quadrant) +// = -atan( |y|/|x| ) if x >= 0 and y < 0 (IV-quadrant) +// Which can be simplified to: +// atan2(y, x) = sign(y) * atan( |y|/|x| ) if x >= 0 +// = sign(y) * (pi - atan( |y|/|x| )) if x < 0 + +// * Range reduction 2: reciprocal +// Now that the argument inside atan is positive, we can use the formula: +// atan(1/x) = pi/2 - atan(x) +// to make the argument inside atan <= 1 as follow: +// atan2(y, x) = sign(y) * atan( |y|/|x|) if 0 <= |y| <= x +// = sign(y) * (pi/2 - atan( |x|/|y| ) if 0 <= x < |y| +// = sign(y) * (pi - atan( |y|/|x| )) if 0 <= |y| <= -x +// = sign(y) * (pi/2 + atan( |x|/|y| )) if 0 <= -x < |y| + +// * Range reduction 3: look up table. +// After the previous two range reduction steps, we reduce the problem to +// compute atan(u) with 0 <= u <= 1, or to be precise: +// atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|). +// An accurate polynomial approximation for the whole [0, 1] input range will +// require a very large degree. To make it more efficient, we reduce the input +// range further by finding an integer idx such that: +// | n/d - idx/16 | <= 1/32. +// In particular, +// idx := 2^-4 * round(2^4 * n/d) +// Then for the fast pass, we find a polynomial approximation for: +// atan( n/d ) ~ atan( idx/16 ) + (n/d - idx/16) * Q(n/d - idx/16) +// with Q(x) = x - x^3/3 be the cubic Taylor polynomial of atan(x). +// It's error in float-float precision is estimated in Sollya to be: +// > P = x - x^3/3; +// > dirtyinfnorm(atan(x) - P, [-2^-5, 2^-5]); +// 0x1.995...p-28. 
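As a quick sanity check of the reduction described above, the lookup-table identity atan(n/d) = atan(k) + atan((n - d*k) / (d + n*k)) with k = idx/16, together with the quadrant adjustment, can be verified against the C math library. The snippet below is a standalone sketch that uses only <cmath> (not the in-tree fputil helpers) and picks an arbitrary II-quadrant input:

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative input in the II-quadrant: x < 0, y >= 0, |y| <= |x|.
  double y = 0.3, x = -1.7;
  double n = std::fmin(std::fabs(y), std::fabs(x)); // numerator   = min(|x|, |y|)
  double d = std::fmax(std::fabs(y), std::fabs(x)); // denominator = max(|x|, |y|)
  double k = std::round(16.0 * n / d) / 16.0;       // nearest idx/16

  // Range reduction step 3: atan(n/d) = atan(k) + atan((n - d*k) / (d + n*k)).
  double lhs = std::atan(n / d);
  double rhs = std::atan(k) + std::atan((n - d * k) / (d + n * k));
  std::printf("reduced:  %.17g vs %.17g\n", lhs, rhs);

  // Range reduction steps 1 and 2: for x < 0, y >= 0, |y| <= |x|,
  // atan2(y, x) = pi - atan(|y| / |x|).
  double pi = std::acos(-1.0);
  std::printf("quadrant: %.17g vs %.17g\n", std::atan2(y, x), pi - lhs);
  return 0;
}

Both printed pairs should agree to within a few ulps.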
+ +LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) { + using FPBits = typename fputil::FPBits; + constexpr float IS_NEG[2] = {1.0f, -1.0f}; + constexpr FloatFloat ZERO = {0.0f, 0.0f}; + constexpr FloatFloat MZERO = {-0.0f, -0.0f}; + constexpr FloatFloat PI = {-0x1.777a5cp-24f, 0x1.921fb6p1f}; + constexpr FloatFloat MPI = {0x1.777a5cp-24f, -0x1.921fb6p1f}; + constexpr FloatFloat PI_OVER_4 = {-0x1.777a5cp-26f, 0x1.921fb6p-1f}; + constexpr FloatFloat PI_OVER_2 = {-0x1.777a5cp-25f, 0x1.921fb6p0f}; + constexpr FloatFloat MPI_OVER_2 = {-0x1.777a5cp-25f, 0x1.921fb6p0f}; + constexpr FloatFloat THREE_PI_OVER_4 = {-0x1.99bc5cp-28f, 0x1.2d97c8p1f}; + // Adjustment for constant term: + // CONST_ADJ[x_sign][y_sign][recip] + constexpr FloatFloat CONST_ADJ[2][2][2] = { + {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}}, + {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}}; + + FPBits x_bits(x), y_bits(y); + bool x_sign = x_bits.sign().is_neg(); + bool y_sign = y_bits.sign().is_neg(); + x_bits = x_bits.abs(); + y_bits = y_bits.abs(); + uint32_t x_abs = x_bits.uintval(); + uint32_t y_abs = y_bits.uintval(); + bool recip = x_abs < y_abs; + uint32_t min_abs = recip ? x_abs : y_abs; + uint32_t max_abs = !recip ? x_abs : y_abs; + auto min_exp = static_cast(min_abs >> FPBits::FRACTION_LEN); + auto max_exp = static_cast(max_abs >> FPBits::FRACTION_LEN); + + float num = FPBits(min_abs).get_val(); + float den = FPBits(max_abs).get_val(); + + // Check for exceptional cases, whether inputs are 0, inf, nan, or close to + // overflow, or close to underflow. + if (LIBC_UNLIKELY(max_exp > 0xffU - 64U || min_exp < 64U)) { + if (x_bits.is_nan() || y_bits.is_nan()) + return FPBits::quiet_nan().get_val(); + unsigned x_except = x == 0.0f ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1); + unsigned y_except = y == 0.0f ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1); + + // Exceptional cases: + // EXCEPT[y_except][x_except][x_is_neg] + // with x_except & y_except: + // 0: zero + // 1: finite, non-zero + // 2: infinity + constexpr FloatFloat EXCEPTS[3][3][2] = { + {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}}, + {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}}, + {{PI_OVER_2, PI_OVER_2}, + {PI_OVER_2, PI_OVER_2}, + {PI_OVER_4, THREE_PI_OVER_4}}, + }; + + if ((x_except != 1) || (y_except != 1)) { + FloatFloat r = EXCEPTS[y_except][x_except][x_sign]; + return fputil::multiply_add(IS_NEG[y_sign], r.hi, IS_NEG[y_sign] * r.lo); + } + bool scale_up = min_exp < 64U; + bool scale_down = max_exp > 0xffU - 64U; + // At least one input is denormal, multiply both numerator and denominator + // by some large enough power of 2 to normalize denormal inputs. + if (scale_up) { + num *= 0x1.0p32f; + if (!scale_down) + den *= 0x1.0p32f; + } else if (scale_down) { + den *= 0x1.0p-32f; + num *= 0x1.0p-32f; + } + + min_abs = FPBits(num).uintval(); + max_abs = FPBits(den).uintval(); + min_exp = static_cast(min_abs >> FPBits::FRACTION_LEN); + max_exp = static_cast(max_abs >> FPBits::FRACTION_LEN); + } + + float final_sign = IS_NEG[(x_sign != y_sign) != recip]; + FloatFloat const_term = CONST_ADJ[x_sign][y_sign][recip]; + unsigned exp_diff = max_exp - min_exp; + // We have the following bound for normalized n and d: + // 2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1). 
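// For context on the cutoff used just below: when exp_diff > 25 this bound
// gives n/d < 2^-25, and since atan(u) = u - u^3/3 + ..., the relative
// difference between atan(n/d) and n/d is roughly (n/d)^2 / 3 < 2^-50, i.e.
// below what the float-float result can resolve, which is presumably why the
// branch below can return const_term + n/d directly.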
+ if (LIBC_UNLIKELY(exp_diff > 25)) + return fputil::multiply_add(final_sign, const_term.hi, + final_sign * (const_term.lo + num / den)); + + float k = fputil::nearest_integer(16.0f * num / den); + unsigned idx = static_cast(k); + // k = idx / 16 + k *= 0x1.0p-4f; + + // Range reduction: + // atan(n/d) - atan(k/64) = atan((n/d - k/16) / (1 + (n/d) * (k/16))) + // = atan((n - d * k/16)) / (d + n * k/16)) + FloatFloat num_k = fputil::exact_mult(num, k); + FloatFloat den_k = fputil::exact_mult(den, k); + + // num_dd = n - d * k + FloatFloat num_ff = fputil::exact_add(num - den_k.hi, -den_k.lo); + // den_dd = d + n * k + FloatFloat den_ff = fputil::exact_add(den, num_k.hi); + den_ff.lo += num_k.lo; + + // q = (n - d * k) / (d + n * k) + FloatFloat q = fputil::div(num_ff, den_ff); + // p ~ atan(q) + FloatFloat p = atan_eval(q); + + FloatFloat r = fputil::add(const_term, fputil::add(ATAN_I[idx], p)); + return final_sign * r.hi; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index 213dbd959039c..a2a0bb698f81a 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -400,7 +400,7 @@ LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { #else double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val(); dx = fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact - dx_c0 = fputil::exact_mult<28>(dx, COEFFS[0]); // Exact + dx_c0 = fputil::exact_mult(dx, COEFFS[0]); // Exact #endif // LIBC_TARGET_CPU_HAS_FMA double dx2 = dx * dx; diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index 06aeb49495ad2..711a12219c847 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -21,7 +21,7 @@ namespace LIBC_NAMESPACE_DECL { #ifdef LIBC_TARGET_CPU_HAS_FMA -static constexpr unsigned SPLIT = DEFAULT_DOUBLE_SPLIT; +static constexpr unsigned SPLIT = fputil::DefaultSplit::VALUE; #else // When there is no-FMA instructions, in order to have exact product of 2 double // precision with directional roundings, we need to lower the precision of the diff --git a/libc/src/math/generic/range_reduction_double_fma.h b/libc/src/math/generic/range_reduction_double_fma.h index cab031c28baa1..8e0bc3a42462c 100644 --- a/libc/src/math/generic/range_reduction_double_fma.h +++ b/libc/src/math/generic/range_reduction_double_fma.h @@ -33,14 +33,14 @@ LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) { // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 x_reduced = xbits.get_val(); // x * c_hi = ph.hi + ph.lo exactly. - DoubleDouble ph = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + DoubleDouble ph = fputil::exact_mult( + x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); // x * c_mid = pm.hi + pm.lo exactly. - DoubleDouble pm = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + DoubleDouble pm = fputil::exact_mult( + x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); // x * c_lo = pl.hi + pl.lo exactly. - DoubleDouble pl = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]); + DoubleDouble pl = fputil::exact_mult( + x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]); // Extract integral parts and fractional parts of (ph.lo + pm.hi). 
double sum_hi = ph.lo + pm.hi; double kd = fputil::nearest_integer(sum_hi); diff --git a/libc/src/math/generic/range_reduction_double_nofma.h b/libc/src/math/generic/range_reduction_double_nofma.h index 5640732947798..606c3f8185d61 100644 --- a/libc/src/math/generic/range_reduction_double_nofma.h +++ b/libc/src/math/generic/range_reduction_double_nofma.h @@ -34,14 +34,14 @@ LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) { x_reduced = xbits.get_val(); // x * c_hi = ph.hi + ph.lo exactly. DoubleDouble x_split = fputil::split(x_reduced); - DoubleDouble ph = fputil::exact_mult(x_split, x_reduced, - ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + DoubleDouble ph = fputil::exact_mult( + x_split, x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); // x * c_mid = pm.hi + pm.lo exactly. - DoubleDouble pm = fputil::exact_mult(x_split, x_reduced, - ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + DoubleDouble pm = fputil::exact_mult( + x_split, x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); // x * c_lo = pl.hi + pl.lo exactly. - DoubleDouble pl = fputil::exact_mult(x_split, x_reduced, - ONE_TWENTY_EIGHT_OVER_PI[idx][2]); + DoubleDouble pl = fputil::exact_mult( + x_split, x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]); // Extract integral parts and fractional parts of (ph.lo + pm.hi). double sum_hi = ph.lo + pm.hi; double kd = fputil::nearest_integer(sum_hi); diff --git a/libc/src/stdfix/CMakeLists.txt b/libc/src/stdfix/CMakeLists.txt index 37292d85367fe..6fb06b8d7e9ae 100644 --- a/libc/src/stdfix/CMakeLists.txt +++ b/libc/src/stdfix/CMakeLists.txt @@ -47,6 +47,18 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.__support.CPP.bit libc.src.__support.fixed_point.fx_bits ) + + add_entrypoint_object( + countls${suffix} + HDRS + countls${suffix}.h + SRCS + countls${suffix}.cpp + COMPILE_OPTIONS + ${libc_opt_high_flag} + DEPENDS + libc.src.__support.fixed_point.fx_bits + ) endforeach() add_entrypoint_object( diff --git a/libc/src/stdfix/countlshk.cpp b/libc/src/stdfix/countlshk.cpp new file mode 100644 index 0000000000000..f94728beff1cb --- /dev/null +++ b/libc/src/stdfix/countlshk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlshk function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlshk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlshk, (short accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlshk.h b/libc/src/stdfix/countlshk.h new file mode 100644 index 0000000000000..ab334244e166a --- /dev/null +++ b/libc/src/stdfix/countlshk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlshk function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlshk(short accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H diff --git a/libc/src/stdfix/countlshr.cpp b/libc/src/stdfix/countlshr.cpp new file mode 100644 index 0000000000000..d77d3e9a3c22a --- /dev/null +++ b/libc/src/stdfix/countlshr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlshr function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlshr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlshr, (short fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlshr.h b/libc/src/stdfix/countlshr.h new file mode 100644 index 0000000000000..579b7b680406e --- /dev/null +++ b/libc/src/stdfix/countlshr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlshr function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlshr(short fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H diff --git a/libc/src/stdfix/countlsk.cpp b/libc/src/stdfix/countlsk.cpp new file mode 100644 index 0000000000000..b6f56adee16a6 --- /dev/null +++ b/libc/src/stdfix/countlsk.cpp @@ -0,0 +1,18 @@ +//===-- Implementation for countlsk function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsk, (accum f)) { return fixed_point::countls(f); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsk.h b/libc/src/stdfix/countlsk.h new file mode 100644 index 0000000000000..d0c893bc078d5 --- /dev/null +++ b/libc/src/stdfix/countlsk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsk function -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsk(accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSK_H diff --git a/libc/src/stdfix/countlslk.cpp b/libc/src/stdfix/countlslk.cpp new file mode 100644 index 0000000000000..9bf30ff34c6ee --- /dev/null +++ b/libc/src/stdfix/countlslk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlslk function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlslk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlslk, (long accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlslk.h b/libc/src/stdfix/countlslk.h new file mode 100644 index 0000000000000..60fa469797b7a --- /dev/null +++ b/libc/src/stdfix/countlslk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlslk function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlslk(long accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H diff --git a/libc/src/stdfix/countlslr.cpp b/libc/src/stdfix/countlslr.cpp new file mode 100644 index 0000000000000..774023c734a37 --- /dev/null +++ b/libc/src/stdfix/countlslr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlslr function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlslr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlslr, (long fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlslr.h b/libc/src/stdfix/countlslr.h new file mode 100644 index 0000000000000..c909551e77a1a --- /dev/null +++ b/libc/src/stdfix/countlslr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlslr function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlslr(long fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H diff --git a/libc/src/stdfix/countlsr.cpp b/libc/src/stdfix/countlsr.cpp new file mode 100644 index 0000000000000..14563127ad5e9 --- /dev/null +++ b/libc/src/stdfix/countlsr.cpp @@ -0,0 +1,18 @@ +//===-- Implementation for countlsr function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsr, (fract f)) { return fixed_point::countls(f); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsr.h b/libc/src/stdfix/countlsr.h new file mode 100644 index 0000000000000..75dcf4aff0ca3 --- /dev/null +++ b/libc/src/stdfix/countlsr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsr function -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsr(fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSR_H diff --git a/libc/src/stdfix/countlsuhk.cpp b/libc/src/stdfix/countlsuhk.cpp new file mode 100644 index 0000000000000..2cc266f47da1f --- /dev/null +++ b/libc/src/stdfix/countlsuhk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsuhk function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsuhk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsuhk, (unsigned short accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuhk.h b/libc/src/stdfix/countlsuhk.h new file mode 100644 index 0000000000000..fcb2fec3500d4 --- /dev/null +++ b/libc/src/stdfix/countlsuhk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsuhk function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsuhk(unsigned short accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H diff --git a/libc/src/stdfix/countlsuhr.cpp b/libc/src/stdfix/countlsuhr.cpp new file mode 100644 index 0000000000000..f30b0dd731aa9 --- /dev/null +++ b/libc/src/stdfix/countlsuhr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsuhr function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsuhr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsuhr, (unsigned short fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuhr.h b/libc/src/stdfix/countlsuhr.h new file mode 100644 index 0000000000000..b60132dc7f22b --- /dev/null +++ b/libc/src/stdfix/countlsuhr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsuhr function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsuhr(unsigned short fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H diff --git a/libc/src/stdfix/countlsuk.cpp b/libc/src/stdfix/countlsuk.cpp new file mode 100644 index 0000000000000..90617cfeb5cdc --- /dev/null +++ b/libc/src/stdfix/countlsuk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsuk function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsuk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsuk, (unsigned accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuk.h b/libc/src/stdfix/countlsuk.h new file mode 100644 index 0000000000000..7ad0e701b927b --- /dev/null +++ b/libc/src/stdfix/countlsuk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsuk function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsuk(unsigned accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H diff --git a/libc/src/stdfix/countlsulk.cpp b/libc/src/stdfix/countlsulk.cpp new file mode 100644 index 0000000000000..04090dd86c732 --- /dev/null +++ b/libc/src/stdfix/countlsulk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsulk function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsulk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsulk, (unsigned long accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsulk.h b/libc/src/stdfix/countlsulk.h new file mode 100644 index 0000000000000..55ca9d2e20ff0 --- /dev/null +++ b/libc/src/stdfix/countlsulk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsulk function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsulk(unsigned long accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H diff --git a/libc/src/stdfix/countlsulr.cpp b/libc/src/stdfix/countlsulr.cpp new file mode 100644 index 0000000000000..d9d6ff404c211 --- /dev/null +++ b/libc/src/stdfix/countlsulr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsulr function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsulr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsulr, (unsigned long fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsulr.h b/libc/src/stdfix/countlsulr.h new file mode 100644 index 0000000000000..59e7d726d01b9 --- /dev/null +++ b/libc/src/stdfix/countlsulr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsulr function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsulr(unsigned long fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H diff --git a/libc/src/stdfix/countlsur.cpp b/libc/src/stdfix/countlsur.cpp new file mode 100644 index 0000000000000..777e5f387aadf --- /dev/null +++ b/libc/src/stdfix/countlsur.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsur function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsur.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsur, (unsigned fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsur.h b/libc/src/stdfix/countlsur.h new file mode 100644 index 0000000000000..1d34e971a52b3 --- /dev/null +++ b/libc/src/stdfix/countlsur.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsur function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsur(unsigned fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt index ef9bfe57bc4ec..dd28aa67280b7 100644 --- a/libc/src/time/CMakeLists.txt +++ b/libc/src/time/CMakeLists.txt @@ -22,6 +22,8 @@ add_object_library( DEPENDS libc.include.time libc.src.__support.CPP.limits + libc.src.__support.CPP.string_view + libc.src.__support.CPP.optional libc.src.errno.errno .time_constants libc.hdr.types.time_t diff --git a/libc/src/time/mktime.cpp b/libc/src/time/mktime.cpp index 3874cad02facb..fc05ff2930434 100644 --- a/libc/src/time/mktime.cpp +++ b/libc/src/time/mktime.cpp @@ -14,100 +14,8 @@ namespace LIBC_NAMESPACE_DECL { -// Returns number of years from (1, year). -static constexpr int64_t get_num_of_leap_years_before(int64_t year) { - return (year / 4) - (year / 100) + (year / 400); -} - -// Returns True if year is a leap year. -static constexpr bool is_leap_year(const int64_t year) { - return (((year) % 4) == 0 && (((year) % 100) != 0 || ((year) % 400) == 0)); -} - LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) { - // Unlike most C Library functions, mktime doesn't just die on bad input. - // TODO(rtenneti); Handle leap seconds. - int64_t tm_year_from_base = tm_out->tm_year + time_constants::TIME_YEAR_BASE; - - // 32-bit end-of-the-world is 03:14:07 UTC on 19 January 2038. - if (sizeof(time_t) == 4 && - tm_year_from_base >= time_constants::END_OF32_BIT_EPOCH_YEAR) { - if (tm_year_from_base > time_constants::END_OF32_BIT_EPOCH_YEAR) - return time_utils::out_of_range(); - if (tm_out->tm_mon > 0) - return time_utils::out_of_range(); - if (tm_out->tm_mday > 19) - return time_utils::out_of_range(); - else if (tm_out->tm_mday == 19) { - if (tm_out->tm_hour > 3) - return time_utils::out_of_range(); - else if (tm_out->tm_hour == 3) { - if (tm_out->tm_min > 14) - return time_utils::out_of_range(); - else if (tm_out->tm_min == 14) { - if (tm_out->tm_sec > 7) - return time_utils::out_of_range(); - } - } - } - } - - // Years are ints. A 32-bit year will fit into a 64-bit time_t. - // A 64-bit year will not. - static_assert( - sizeof(int) == 4, - "ILP64 is unimplemented. 
This implementation requires 32-bit integers."); - - // Calculate number of months and years from tm_mon. - int64_t month = tm_out->tm_mon; - if (month < 0 || month >= time_constants::MONTHS_PER_YEAR - 1) { - int64_t years = month / 12; - month %= 12; - if (month < 0) { - years--; - month += 12; - } - tm_year_from_base += years; - } - bool tm_year_is_leap = is_leap_year(tm_year_from_base); - - // Calculate total number of days based on the month and the day (tm_mday). - int64_t total_days = tm_out->tm_mday - 1; - for (int64_t i = 0; i < month; ++i) - total_days += time_constants::NON_LEAP_YEAR_DAYS_IN_MONTH[i]; - // Add one day if it is a leap year and the month is after February. - if (tm_year_is_leap && month > 1) - total_days++; - - // Calculate total numbers of days based on the year. - total_days += (tm_year_from_base - time_constants::EPOCH_YEAR) * - time_constants::DAYS_PER_NON_LEAP_YEAR; - if (tm_year_from_base >= time_constants::EPOCH_YEAR) { - total_days += get_num_of_leap_years_before(tm_year_from_base - 1) - - get_num_of_leap_years_before(time_constants::EPOCH_YEAR); - } else if (tm_year_from_base >= 1) { - total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) - - get_num_of_leap_years_before(tm_year_from_base - 1); - } else { - // Calculate number of leap years until 0th year. - total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) - - get_num_of_leap_years_before(0); - if (tm_year_from_base <= 0) { - total_days -= 1; // Subtract 1 for 0th year. - // Calculate number of leap years until -1 year - if (tm_year_from_base < 0) { - total_days -= get_num_of_leap_years_before(-tm_year_from_base) - - get_num_of_leap_years_before(1); - } - } - } - - // TODO: https://github.com/llvm/llvm-project/issues/121962 - // Need to handle timezone and update of tm_isdst. - int64_t seconds = tm_out->tm_sec + - tm_out->tm_min * time_constants::SECONDS_PER_MIN + - tm_out->tm_hour * time_constants::SECONDS_PER_HOUR + - total_days * time_constants::SECONDS_PER_DAY; + int64_t seconds = time_utils::mktime_internal(tm_out); // Update the tm structure's year, month, day, etc. from seconds. if (time_utils::update_from_seconds(seconds, tm_out) < 0) diff --git a/libc/src/time/time_constants.h b/libc/src/time/time_constants.h index 3e25f741745ab..bcf19ff5f193e 100644 --- a/libc/src/time/time_constants.h +++ b/libc/src/time/time_constants.h @@ -18,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL { namespace time_constants { enum Month : int { - JANUARY, + JANUARY = 0, FEBRUARY, MARCH, APRIL, @@ -32,14 +32,28 @@ enum Month : int { DECEMBER }; +enum WeekDay : int { + SUNDAY = 0, + MONDAY, + TUESDAY, + WEDNESDAY, + THURSDAY, + FRIDAY, + SATURDAY +}; + constexpr int SECONDS_PER_MIN = 60; constexpr int MINUTES_PER_HOUR = 60; constexpr int HOURS_PER_DAY = 24; constexpr int DAYS_PER_WEEK = 7; +constexpr int WEEKS_PER_YEAR = 52; constexpr int MONTHS_PER_YEAR = 12; constexpr int DAYS_PER_NON_LEAP_YEAR = 365; constexpr int DAYS_PER_LEAP_YEAR = 366; +constexpr int LAST_DAY_OF_NON_LEAP_YEAR = DAYS_PER_NON_LEAP_YEAR - 1; +constexpr int LAST_DAY_OF_LEAP_YEAR = DAYS_PER_LEAP_YEAR - 1; + constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR; constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY; constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR = @@ -49,6 +63,8 @@ constexpr int TIME_YEAR_BASE = 1900; constexpr int EPOCH_YEAR = 1970; constexpr int EPOCH_WEEK_DAY = 4; +constexpr int ISO_FIRST_DAY_OF_YEAR = 3; // the 4th day of the year, 0-indexed. 
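// (ISO 8601 counts the week containing January 4th, i.e. the first week with
// at least four days in the new year, as week 1; TMReader::get_iso_week in
// time_utils.h uses this constant for that check.)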
+ // For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are // not within the normal ranges as defined in , or if struct tm's // tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm diff --git a/libc/src/time/time_utils.cpp b/libc/src/time/time_utils.cpp index abc93b8cb961e..3ccb2dd934967 100644 --- a/libc/src/time/time_utils.cpp +++ b/libc/src/time/time_utils.cpp @@ -12,9 +12,103 @@ #include "src/__support/macros/config.h" #include "src/time/time_constants.h" +#include + namespace LIBC_NAMESPACE_DECL { namespace time_utils { +// TODO: clean this up in a followup patch +int64_t mktime_internal(const tm *tm_out) { + // Unlike most C Library functions, mktime doesn't just die on bad input. + // TODO(rtenneti); Handle leap seconds. + int64_t tm_year_from_base = tm_out->tm_year + time_constants::TIME_YEAR_BASE; + + // 32-bit end-of-the-world is 03:14:07 UTC on 19 January 2038. + if (sizeof(time_t) == 4 && + tm_year_from_base >= time_constants::END_OF32_BIT_EPOCH_YEAR) { + if (tm_year_from_base > time_constants::END_OF32_BIT_EPOCH_YEAR) + return time_utils::out_of_range(); + if (tm_out->tm_mon > 0) + return time_utils::out_of_range(); + if (tm_out->tm_mday > 19) + return time_utils::out_of_range(); + else if (tm_out->tm_mday == 19) { + if (tm_out->tm_hour > 3) + return time_utils::out_of_range(); + else if (tm_out->tm_hour == 3) { + if (tm_out->tm_min > 14) + return time_utils::out_of_range(); + else if (tm_out->tm_min == 14) { + if (tm_out->tm_sec > 7) + return time_utils::out_of_range(); + } + } + } + } + + // Years are ints. A 32-bit year will fit into a 64-bit time_t. + // A 64-bit year will not. + static_assert( + sizeof(int) == 4, + "ILP64 is unimplemented. This implementation requires 32-bit integers."); + + // Calculate number of months and years from tm_mon. + int64_t month = tm_out->tm_mon; + if (month < 0 || month >= time_constants::MONTHS_PER_YEAR - 1) { + int64_t years = month / 12; + month %= 12; + if (month < 0) { + years--; + month += 12; + } + tm_year_from_base += years; + } + bool tm_year_is_leap = time_utils::is_leap_year(tm_year_from_base); + + // Calculate total number of days based on the month and the day (tm_mday). + int64_t total_days = tm_out->tm_mday - 1; + for (int64_t i = 0; i < month; ++i) + total_days += time_constants::NON_LEAP_YEAR_DAYS_IN_MONTH[i]; + // Add one day if it is a leap year and the month is after February. + if (tm_year_is_leap && month > 1) + total_days++; + + // Calculate total numbers of days based on the year. + total_days += (tm_year_from_base - time_constants::EPOCH_YEAR) * + time_constants::DAYS_PER_NON_LEAP_YEAR; + if (tm_year_from_base >= time_constants::EPOCH_YEAR) { + total_days += + time_utils::get_num_of_leap_years_before(tm_year_from_base - 1) - + time_utils::get_num_of_leap_years_before(time_constants::EPOCH_YEAR); + } else if (tm_year_from_base >= 1) { + total_days -= + time_utils::get_num_of_leap_years_before(time_constants::EPOCH_YEAR) - + time_utils::get_num_of_leap_years_before(tm_year_from_base - 1); + } else { + // Calculate number of leap years until 0th year. + total_days -= + time_utils::get_num_of_leap_years_before(time_constants::EPOCH_YEAR) - + time_utils::get_num_of_leap_years_before(0); + if (tm_year_from_base <= 0) { + total_days -= 1; // Subtract 1 for 0th year. 
+ // Calculate number of leap years until -1 year + if (tm_year_from_base < 0) { + total_days -= + time_utils::get_num_of_leap_years_before(-tm_year_from_base) - + time_utils::get_num_of_leap_years_before(1); + } + } + } + + // TODO: https://github.com/llvm/llvm-project/issues/121962 + // Need to handle timezone and update of tm_isdst. + int64_t seconds = tm_out->tm_sec + + tm_out->tm_min * time_constants::SECONDS_PER_MIN + + tm_out->tm_hour * time_constants::SECONDS_PER_HOUR + + total_days * time_constants::SECONDS_PER_DAY; + return seconds; +} + static int64_t computeRemainingYears(int64_t daysPerYears, int64_t quotientYears, int64_t *remainingDays) { @@ -42,7 +136,7 @@ static int64_t computeRemainingYears(int64_t daysPerYears, // // Compute the number of months from the remaining days. Finally, adjust years // to be 1900 and months to be from January. -int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) { +int64_t update_from_seconds(int64_t total_seconds, tm *tm) { // Days in month starting from March in the year 2000. static const char daysInMonth[] = {31 /* Mar */, 30, 31, 30, 31, 31, 30, 31, 30, 31, 31, 29}; diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h index 5e0a692d4db04..68eaac8c04f11 100644 --- a/libc/src/time/time_utils.h +++ b/libc/src/time/time_utils.h @@ -12,6 +12,8 @@ #include "hdr/types/size_t.h" #include "hdr/types/struct_tm.h" #include "hdr/types/time_t.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/CPP/string_view.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/errno/libc_errno.h" @@ -22,9 +24,13 @@ namespace LIBC_NAMESPACE_DECL { namespace time_utils { +// calculates the seconds from the epoch for tm_in. Does not update the struct, +// you must call update_from_seconds for that. +int64_t mktime_internal(const tm *tm_out); + // Update the "tm" structure's year, month, etc. members from seconds. // "total_seconds" is the number of seconds since January 1st, 1970. -extern int64_t update_from_seconds(int64_t total_seconds, struct tm *tm); +int64_t update_from_seconds(int64_t total_seconds, tm *tm); // TODO(michaelrj): move these functions to use ErrorOr instead of setting // errno. They always accompany a specific return value so we only need the one @@ -43,7 +49,7 @@ LIBC_INLINE time_t out_of_range() { LIBC_INLINE void invalid_value() { libc_errno = EINVAL; } -LIBC_INLINE char *asctime(const struct tm *timeptr, char *buffer, +LIBC_INLINE char *asctime(const tm *timeptr, char *buffer, size_t bufferLength) { if (timeptr == nullptr || buffer == nullptr) { invalid_value(); @@ -61,6 +67,7 @@ LIBC_INLINE char *asctime(const struct tm *timeptr, char *buffer, } // TODO(michaelr): move this to use the strftime machinery + // equivalent to strftime(buffer, bufferLength, "%a %b %T %Y\n", timeptr) int written_size = __builtin_snprintf( buffer, bufferLength, "%.3s %.3s%3d %.2d:%.2d:%.2d %d\n", time_constants::WEEK_DAY_NAMES[timeptr->tm_wday].data(), @@ -76,7 +83,7 @@ LIBC_INLINE char *asctime(const struct tm *timeptr, char *buffer, return buffer; } -LIBC_INLINE struct tm *gmtime_internal(const time_t *timer, struct tm *result) { +LIBC_INLINE tm *gmtime_internal(const time_t *timer, tm *result) { int64_t seconds = *timer; // Update the tm structure's year, month, day, etc. from seconds. 
if (update_from_seconds(seconds, result) < 0) { @@ -89,11 +96,252 @@ LIBC_INLINE struct tm *gmtime_internal(const time_t *timer, struct tm *result) { // TODO: localtime is not yet implemented and a temporary solution is to // use gmtime, https://github.com/llvm/llvm-project/issues/107597 -LIBC_INLINE struct tm *localtime(const time_t *t_ptr) { - static struct tm result; +LIBC_INLINE tm *localtime(const time_t *t_ptr) { + static tm result; return time_utils::gmtime_internal(t_ptr, &result); } +// Returns number of years from (1, year). +LIBC_INLINE constexpr int64_t get_num_of_leap_years_before(int64_t year) { + return (year / 4) - (year / 100) + (year / 400); +} + +// Returns True if year is a leap year. +LIBC_INLINE constexpr bool is_leap_year(const int64_t year) { + return (((year) % 4) == 0 && (((year) % 100) != 0 || ((year) % 400) == 0)); +} + +LIBC_INLINE constexpr int get_days_in_year(const int year) { + return is_leap_year(year) ? time_constants::DAYS_PER_LEAP_YEAR + : time_constants::DAYS_PER_NON_LEAP_YEAR; +} + +// This is a helper class that takes a struct tm and lets you inspect its +// values. Where relevant, results are bounds checked and returned as optionals. +// This class does not, however, do data normalization except where necessary. +// It will faithfully return a date of 9999-99-99, even though that makes no +// sense. +class TMReader final { + const tm *timeptr; + + template + LIBC_INLINE constexpr cpp::optional + bounds_check(const cpp::array &arr, int index) const { + if (index >= 0 && index < static_cast(arr.size())) + return arr[index]; + return cpp::nullopt; + } + +public: + LIBC_INLINE constexpr explicit TMReader(const tm *tmptr) : timeptr(tmptr) {} + + // Strings + LIBC_INLINE constexpr cpp::optional + get_weekday_short_name() const { + return bounds_check(time_constants::WEEK_DAY_NAMES, timeptr->tm_wday); + } + + LIBC_INLINE constexpr cpp::optional + get_weekday_full_name() const { + return bounds_check(time_constants::WEEK_DAY_FULL_NAMES, timeptr->tm_wday); + } + + LIBC_INLINE constexpr cpp::optional + get_month_short_name() const { + return bounds_check(time_constants::MONTH_NAMES, timeptr->tm_mon); + } + + LIBC_INLINE constexpr cpp::optional + get_month_full_name() const { + return bounds_check(time_constants::MONTH_FULL_NAMES, timeptr->tm_mon); + } + + LIBC_INLINE constexpr cpp::string_view get_am_pm() const { + if (timeptr->tm_hour < 12) + return "AM"; + return "PM"; + } + + LIBC_INLINE constexpr cpp::string_view get_timezone_name() const { + // TODO: timezone support + return "UTC"; + } + + // Numbers + LIBC_INLINE constexpr int get_sec() const { return timeptr->tm_sec; } + LIBC_INLINE constexpr int get_min() const { return timeptr->tm_min; } + LIBC_INLINE constexpr int get_hour() const { return timeptr->tm_hour; } + LIBC_INLINE constexpr int get_mday() const { return timeptr->tm_mday; } + LIBC_INLINE constexpr int get_mon() const { return timeptr->tm_mon; } + LIBC_INLINE constexpr int get_yday() const { return timeptr->tm_yday; } + LIBC_INLINE constexpr int get_wday() const { return timeptr->tm_wday; } + LIBC_INLINE constexpr int get_isdst() const { return timeptr->tm_isdst; } + + // returns the year, counting from 1900 + LIBC_INLINE constexpr int get_year_raw() const { return timeptr->tm_year; } + // returns the year, counting from 0 + LIBC_INLINE constexpr int get_year() const { + return timeptr->tm_year + time_constants::TIME_YEAR_BASE; + } + + LIBC_INLINE constexpr int is_leap_year() const { + return 
time_utils::is_leap_year(get_year()); + } + + LIBC_INLINE constexpr int get_iso_wday() const { + using time_constants::DAYS_PER_WEEK; + using time_constants::MONDAY; + // ISO uses a week that starts on Monday, but struct tm starts its week on + // Sunday. This function normalizes the weekday so that it always returns a + // value 0-6 + const int NORMALIZED_WDAY = timeptr->tm_wday % DAYS_PER_WEEK; + return (NORMALIZED_WDAY + (DAYS_PER_WEEK - MONDAY)) % DAYS_PER_WEEK; + } + + // returns the week of the current year, with weeks starting on start_day. + LIBC_INLINE constexpr int get_week(time_constants::WeekDay start_day) const { + using time_constants::DAYS_PER_WEEK; + // The most recent start_day. The rest of the days into the current week + // don't count, so ignore them. + // Also add 7 to handle start_day > tm_wday + const int start_of_cur_week = + timeptr->tm_yday - + ((timeptr->tm_wday + DAYS_PER_WEEK - start_day) % DAYS_PER_WEEK); + + // The original formula is ceil((start_of_cur_week + 1) / DAYS_PER_WEEK) + // That becomes (start_of_cur_week + 1 + DAYS_PER_WEEK - 1) / DAYS_PER_WEEK) + // Which simplifies to (start_of_cur_week + DAYS_PER_WEEK) / DAYS_PER_WEEK + const int ceil_weeks_since_start = + (start_of_cur_week + DAYS_PER_WEEK) / DAYS_PER_WEEK; + + return ceil_weeks_since_start; + } + + LIBC_INLINE constexpr int get_iso_week() const { + using time_constants::DAYS_PER_WEEK; + using time_constants::ISO_FIRST_DAY_OF_YEAR; + using time_constants::MONDAY; + using time_constants::WeekDay; + using time_constants::WEEKS_PER_YEAR; + + constexpr WeekDay START_DAY = MONDAY; + + // The most recent start_day. The rest of the days into the current week + // don't count, so ignore them. + // Also add 7 to handle start_day > tm_wday + const int start_of_cur_week = + timeptr->tm_yday - + ((timeptr->tm_wday + DAYS_PER_WEEK - START_DAY) % DAYS_PER_WEEK); + + // if the week starts in the previous year, and also if the 4th of this year + // is not in this week. + if (start_of_cur_week < -3) { + const int days_into_prev_year = + get_days_in_year(get_year() - 1) + start_of_cur_week; + // Each year has at least 52 weeks, but a year's last week will be 53 if + // its first week starts in the previous year and its last week ends + // in the next year. We know get_year() - 1 must extend into get_year(), + // so here we check if it also extended into get_year() - 2 and add 1 week + // if it does. + return WEEKS_PER_YEAR + + ((days_into_prev_year % DAYS_PER_WEEK) > ISO_FIRST_DAY_OF_YEAR); + } + + // subtract 1 to account for yday being 0 indexed + const int days_until_end_of_year = + get_days_in_year(get_year()) - start_of_cur_week - 1; + + // if there are less than 3 days from the start of this week to the end of + // the year, then there must be 4 days in this week in the next year, which + // means that this week is the first week of that year. + if (days_until_end_of_year < 3) + return 1; + + // else just calculate the current week like normal. + const int ceil_weeks_since_start = + (start_of_cur_week + DAYS_PER_WEEK) / DAYS_PER_WEEK; + + // add 1 if this year's first week starts in the previous year. 
+ const int WEEK_STARTS_IN_PREV_YEAR = + ((start_of_cur_week + time_constants::DAYS_PER_WEEK) % + time_constants::DAYS_PER_WEEK) > time_constants::ISO_FIRST_DAY_OF_YEAR; + return ceil_weeks_since_start + WEEK_STARTS_IN_PREV_YEAR; + } + + LIBC_INLINE constexpr int get_iso_year() const { + const int BASE_YEAR = get_year(); + // The ISO year is the same as a standard year for all dates after the start + // of the first week and before the last week. Since the first ISO week of a + // year starts on the 4th, anything after that is in this year. + if (timeptr->tm_yday >= time_constants::ISO_FIRST_DAY_OF_YEAR && + timeptr->tm_yday < time_constants::DAYS_PER_NON_LEAP_YEAR - + time_constants::DAYS_PER_WEEK) + return BASE_YEAR; + + const int ISO_WDAY = get_iso_wday(); + // The first week of the ISO year is defined as the week containing the + // 4th day of January. + + // first week + if (timeptr->tm_yday < time_constants::ISO_FIRST_DAY_OF_YEAR) { + /* + If jan 4 is in this week, then we're in BASE_YEAR, else we're in the + previous year. The formula's been rearranged so here's the derivation: + + +--------+-- days until jan 4 + | | + wday + (4 - yday) < 7 + | | + +---------------+-- weekday of jan 4 + + rearranged to get all the constants on one side: + + wday - yday < 7 - 4 + */ + const int IS_CUR_YEAR = (ISO_WDAY - timeptr->tm_yday < + time_constants::DAYS_PER_WEEK - + time_constants::ISO_FIRST_DAY_OF_YEAR); + return BASE_YEAR - !IS_CUR_YEAR; + } + + // last week + const int DAYS_LEFT_IN_YEAR = + get_days_in_year(get_year()) - timeptr->tm_yday; + /* + Similar to above, we're checking if jan 4 (of next year) is in this week. If + it is, this is in the next year. Note that this also handles the case of + yday > days in year gracefully. + + +------------------+-- days until jan 4 (of next year) + | | + wday + (4 + remaining days) < 7 + | | + +-------------------------+-- weekday of jan 4 + + rearranging we get: + + wday + remaining days < 7 - 4 + */ + const int IS_NEXT_YEAR = + (ISO_WDAY + DAYS_LEFT_IN_YEAR < + time_constants::DAYS_PER_WEEK - time_constants::ISO_FIRST_DAY_OF_YEAR); + return BASE_YEAR + IS_NEXT_YEAR; + } + + LIBC_INLINE time_t get_epoch() const { + return static_cast(mktime_internal(timeptr)); + } + + // returns the timezone offset in microwave time: + // return (hours * 100) + minutes; + // This means that a shift of -4:30 is returned as -430, simplifying + // conversion. 
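// (Illustrative note, not part of the patch.) Decoding the packed offset is a
// single divide/modulo by 100, which is what "simplifying conversion" refers to:
//   int offset = -430;          // UTC-4:30
//   int hours = offset / 100;   // -4 (truncation toward zero)
//   int minutes = offset % 100; // -30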
+ LIBC_INLINE constexpr int get_timezone_offset() const { + // TODO: timezone support + return 0; + } +}; + } // namespace time_utils } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/integration/src/pthread/pthread_mutex_test.cpp b/libc/test/integration/src/pthread/pthread_mutex_test.cpp index ce2a3538924da..137daed6bd283 100644 --- a/libc/test/integration/src/pthread/pthread_mutex_test.cpp +++ b/libc/test/integration/src/pthread/pthread_mutex_test.cpp @@ -186,6 +186,10 @@ void multiple_waiters() { LIBC_NAMESPACE::pthread_mutex_destroy(&counter_lock); } +// Test the initializer +[[maybe_unused]] +static pthread_mutex_t test_initializer = PTHREAD_MUTEX_INITIALIZER; + TEST_MAIN() { relay_counter(); wait_and_step(); diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp index 9429b66ad1f98..1f2315281bc1d 100644 --- a/libc/test/src/__support/CPP/bit_test.cpp +++ b/libc/test/src/__support/CPP/bit_test.cpp @@ -41,7 +41,8 @@ TYPED_TEST(LlvmLibcBitTest, HasSingleBit, UnsignedTypes) { constexpr auto LSB = T(1); constexpr auto MSB = T(~(ALL_ONES >> 1)); for (T value = 1; value; value <<= 1) { - auto two_bits_value = value | ((value <= MIDPOINT) ? MSB : LSB); + T two_bits_value = + static_cast(value | ((value <= MIDPOINT) ? MSB : LSB)); EXPECT_FALSE(has_single_bit(two_bits_value)); } } diff --git a/libc/test/src/stdbit/stdc_bit_ceil_uc_test.cpp b/libc/test/src/stdbit/stdc_bit_ceil_uc_test.cpp index 1ef87b0d44de6..6915859b7c669 100644 --- a/libc/test/src/stdbit/stdc_bit_ceil_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_bit_ceil_uc_test.cpp @@ -17,18 +17,21 @@ TEST(LlvmLibcStdcBitceilUcTest, Zero) { TEST(LlvmLibcStdcBitceilUcTest, Ones) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_uc(1U << i), - static_cast(1U << i)); + EXPECT_EQ( + LIBC_NAMESPACE::stdc_bit_ceil_uc(static_cast(1U << i)), + static_cast(1U << i)); } TEST(LlvmLibcStdcBitceilUcTest, OneLessThanPowsTwo) { for (unsigned i = 2U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_uc((1U << i) - 1), + EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_uc( + static_cast((1U << i) - 1)), static_cast(1U << i)); } TEST(LlvmLibcStdcBitceilUcTest, OneMoreThanPowsTwo) { for (unsigned i = 1U; i != UCHAR_WIDTH - 1; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_uc((1U << i) + 1), + EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_uc( + static_cast((1U << i) + 1)), static_cast(1U << (i + 1))); } diff --git a/libc/test/src/stdbit/stdc_bit_ceil_us_test.cpp b/libc/test/src/stdbit/stdc_bit_ceil_us_test.cpp index 56873c51828f1..9a8b46f250f48 100644 --- a/libc/test/src/stdbit/stdc_bit_ceil_us_test.cpp +++ b/libc/test/src/stdbit/stdc_bit_ceil_us_test.cpp @@ -17,18 +17,21 @@ TEST(LlvmLibcStdcBitceilUsTest, Zero) { TEST(LlvmLibcStdcBitceilUsTest, Ones) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_us(1U << i), - static_cast(1U << i)); + EXPECT_EQ( + LIBC_NAMESPACE::stdc_bit_ceil_us(static_cast(1U << i)), + static_cast(1U << i)); } TEST(LlvmLibcStdcBitceilUsTest, OneLessThanPowsTwo) { for (unsigned i = 2U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_us((1U << i) - 1), + EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_us( + static_cast((1U << i) - 1)), static_cast(1U << i)); } TEST(LlvmLibcStdcBitceilUsTest, OneMoreThanPowsTwo) { for (unsigned i = 1U; i != USHRT_WIDTH - 1; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_us((1U << i) + 1), + EXPECT_EQ(LIBC_NAMESPACE::stdc_bit_ceil_us( + static_cast((1U << i) + 1)), 
static_cast(1U << (i + 1))); } diff --git a/libc/test/src/stdbit/stdc_first_leading_one_uc_test.cpp b/libc/test/src/stdbit/stdc_first_leading_one_uc_test.cpp index b8c8db587098e..2ab8397015288 100644 --- a/libc/test/src/stdbit/stdc_first_leading_one_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_first_leading_one_uc_test.cpp @@ -16,6 +16,7 @@ TEST(LlvmLibcStdcFirstLeadingOneUcTest, Zero) { TEST(LlvmLibcStdcFirstLeadingOneUcTest, OneHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_one_uc(1U << i), + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_one_uc( + static_cast(1U << i)), UCHAR_WIDTH - i); } diff --git a/libc/test/src/stdbit/stdc_first_leading_one_us_test.cpp b/libc/test/src/stdbit/stdc_first_leading_one_us_test.cpp index e9488335d9b00..de81275205424 100644 --- a/libc/test/src/stdbit/stdc_first_leading_one_us_test.cpp +++ b/libc/test/src/stdbit/stdc_first_leading_one_us_test.cpp @@ -16,6 +16,7 @@ TEST(LlvmLibcStdcFirstLeadingOneUsTest, Zero) { TEST(LlvmLibcStdcFirstLeadingOneUsTest, OneHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_one_us(1U << i), + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_one_us( + static_cast(1U << i)), USHRT_WIDTH - i); } diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp index ac7e8c7d9e64b..a19d0ab83a9bd 100644 --- a/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_first_leading_zero_uc_test.cpp @@ -16,6 +16,7 @@ TEST(LlvmLibcStdcFirstLeadingZeroUcTest, ALL) { TEST(LlvmLibcStdcFirstLeadingZeroUcTest, ZeroHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_uc(~(1U << i)), + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_uc( + static_cast(~(1U << i))), UCHAR_WIDTH - i); } diff --git a/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp b/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp index 37f8612675a7a..2971267f82a6f 100644 --- a/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp +++ b/libc/test/src/stdbit/stdc_first_leading_zero_us_test.cpp @@ -16,6 +16,7 @@ TEST(LlvmLibcStdcFirstLeadingZeroUsTest, ALL) { TEST(LlvmLibcStdcFirstLeadingZeroUsTest, ZeroHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_us(~(1U << i)), + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_leading_zero_us( + static_cast(~(1U << i))), USHRT_WIDTH - i); } diff --git a/libc/test/src/stdbit/stdc_first_trailing_one_uc_test.cpp b/libc/test/src/stdbit/stdc_first_trailing_one_uc_test.cpp index ed2b4921cdada..5ca4cfca1b479 100644 --- a/libc/test/src/stdbit/stdc_first_trailing_one_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_first_trailing_one_uc_test.cpp @@ -16,5 +16,7 @@ TEST(LlvmLibcStdcFirstTrailingOneUcTest, ALL) { TEST(LlvmLibcStdcFirstTrailingOneUcTest, OneHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_one_uc(1U << i), i + 1); + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_one_uc( + static_cast(1U << i)), + i + 1); } diff --git a/libc/test/src/stdbit/stdc_first_trailing_one_us_test.cpp b/libc/test/src/stdbit/stdc_first_trailing_one_us_test.cpp index 60021552310be..46c69acfb7f07 100644 --- a/libc/test/src/stdbit/stdc_first_trailing_one_us_test.cpp +++ b/libc/test/src/stdbit/stdc_first_trailing_one_us_test.cpp @@ -16,5 +16,7 @@ TEST(LlvmLibcStdcFirstTrailingOneUsTest, ALL) { 
TEST(LlvmLibcStdcFirstTrailingOneUsTest, OneHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_one_us(1U << i), i + 1); + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_one_us( + static_cast(1U << i)), + i + 1); } diff --git a/libc/test/src/stdbit/stdc_first_trailing_zero_uc_test.cpp b/libc/test/src/stdbit/stdc_first_trailing_zero_uc_test.cpp index 2b17aa6536e66..9535ad9ffa3a2 100644 --- a/libc/test/src/stdbit/stdc_first_trailing_zero_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_first_trailing_zero_uc_test.cpp @@ -16,5 +16,7 @@ TEST(LlvmLibcStdcFirstTrailingZeroUcTest, ALL) { TEST(LlvmLibcStdcFirstTrailingZeroUcTest, ZeroHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_zero_uc(~(1U << i)), i + 1); + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_zero_uc( + static_cast(~(1U << i))), + i + 1); } diff --git a/libc/test/src/stdbit/stdc_first_trailing_zero_us_test.cpp b/libc/test/src/stdbit/stdc_first_trailing_zero_us_test.cpp index e370379300e4a..e0dc34fd89996 100644 --- a/libc/test/src/stdbit/stdc_first_trailing_zero_us_test.cpp +++ b/libc/test/src/stdbit/stdc_first_trailing_zero_us_test.cpp @@ -16,5 +16,7 @@ TEST(LlvmLibcStdcFirstTrailingZeroUsTest, ALL) { TEST(LlvmLibcStdcFirstTrailingZeroUsTest, ZeroHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_zero_us(~(1U << i)), i + 1); + EXPECT_EQ(LIBC_NAMESPACE::stdc_first_trailing_zero_us( + static_cast(~(1U << i))), + i + 1); } diff --git a/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp index 1bc189cf0b665..9dd2bdc12479d 100644 --- a/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp @@ -16,5 +16,7 @@ TEST(LlvmLibcStdcHasSingleBitUcTest, Zero) { TEST(LlvmLibcStdcHasSingleBitUcTest, OneHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(1U << i), true); + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc( + static_cast(1U << i)), + true); } diff --git a/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp index a038f6fac0123..3ff0b83751ebf 100644 --- a/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp +++ b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp @@ -16,5 +16,7 @@ TEST(LlvmLibcStdcHasSingleBitUsTest, Zero) { TEST(LlvmLibcStdcHasSingleBitUsTest, OneHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(1U << i), true); + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us( + static_cast(1U << i)), + true); } diff --git a/libc/test/src/stdbit/stdc_leading_ones_uc_test.cpp b/libc/test/src/stdbit/stdc_leading_ones_uc_test.cpp index 5d32d92e327a3..4ba240fdafad4 100644 --- a/libc/test/src/stdbit/stdc_leading_ones_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_leading_ones_uc_test.cpp @@ -17,6 +17,7 @@ TEST(LlvmLibcStdcLeadingOnesUcTest, All) { TEST(LlvmLibcStdcLeadingOnesUcTest, ZeroHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_ones_uc(~(1U << i)), + EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_ones_uc( + static_cast(~(1U << i))), UCHAR_WIDTH - i - 1U); } diff --git a/libc/test/src/stdbit/stdc_leading_ones_us_test.cpp b/libc/test/src/stdbit/stdc_leading_ones_us_test.cpp index 91a125370ec15..0f93eed9e10b8 100644 --- a/libc/test/src/stdbit/stdc_leading_ones_us_test.cpp +++ 
b/libc/test/src/stdbit/stdc_leading_ones_us_test.cpp @@ -17,6 +17,7 @@ TEST(LlvmLibcStdcLeadingOnesUsTest, All) { TEST(LlvmLibcStdcLeadingOnesUsTest, ZeroHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_ones_us(~(1U << i)), + EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_ones_us( + static_cast(~(1U << i))), USHRT_WIDTH - i - 1U); } diff --git a/libc/test/src/stdbit/stdc_leading_zeros_uc_test.cpp b/libc/test/src/stdbit/stdc_leading_zeros_uc_test.cpp index 3d555072927ac..42f78c296fe09 100644 --- a/libc/test/src/stdbit/stdc_leading_zeros_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_leading_zeros_uc_test.cpp @@ -17,6 +17,7 @@ TEST(LlvmLibcStdcLeadingZerosUcTest, Zero) { TEST(LlvmLibcStdcLeadingZerosUcTest, OneHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_zeros_uc(1U << i), + EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_zeros_uc( + static_cast(1U << i)), UCHAR_WIDTH - i - 1U); } diff --git a/libc/test/src/stdbit/stdc_leading_zeros_us_test.cpp b/libc/test/src/stdbit/stdc_leading_zeros_us_test.cpp index afb418a24ad54..967ceb13ff1d7 100644 --- a/libc/test/src/stdbit/stdc_leading_zeros_us_test.cpp +++ b/libc/test/src/stdbit/stdc_leading_zeros_us_test.cpp @@ -17,6 +17,7 @@ TEST(LlvmLibcStdcLeadingZerosUsTest, Zero) { TEST(LlvmLibcStdcLeadingZerosUsTest, OneHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_zeros_us(1U << i), + EXPECT_EQ(LIBC_NAMESPACE::stdc_leading_zeros_us( + static_cast(1U << i)), USHRT_WIDTH - i - 1U); } diff --git a/libc/test/src/stdbit/stdc_trailing_ones_uc_test.cpp b/libc/test/src/stdbit/stdc_trailing_ones_uc_test.cpp index 79d4e5b8b8032..0036408513388 100644 --- a/libc/test/src/stdbit/stdc_trailing_ones_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_trailing_ones_uc_test.cpp @@ -17,5 +17,7 @@ TEST(LlvmLibcStdcTrailingOnesUcTest, ALL) { TEST(LlvmLibcStdcTrailingOnesUcTest, ZeroHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_ones_uc(~(1U << i)), i); + EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_ones_uc( + static_cast(~(1U << i))), + i); } diff --git a/libc/test/src/stdbit/stdc_trailing_ones_us_test.cpp b/libc/test/src/stdbit/stdc_trailing_ones_us_test.cpp index 7ab15743ed1e0..5ebacc829c543 100644 --- a/libc/test/src/stdbit/stdc_trailing_ones_us_test.cpp +++ b/libc/test/src/stdbit/stdc_trailing_ones_us_test.cpp @@ -17,5 +17,7 @@ TEST(LlvmLibcStdcTrailingOnesUsTest, ALL) { TEST(LlvmLibcStdcTrailingOnesUsTest, ZeroHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_ones_us(~(1U << i)), i); + EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_ones_us( + static_cast(~(1U << i))), + i); } diff --git a/libc/test/src/stdbit/stdc_trailing_zeros_uc_test.cpp b/libc/test/src/stdbit/stdc_trailing_zeros_uc_test.cpp index c02b518865d9f..129ab38c45ea8 100644 --- a/libc/test/src/stdbit/stdc_trailing_zeros_uc_test.cpp +++ b/libc/test/src/stdbit/stdc_trailing_zeros_uc_test.cpp @@ -17,5 +17,7 @@ TEST(LlvmLibcStdcTrailingZerosUcTest, Zero) { TEST(LlvmLibcStdcTrailingZerosUcTest, OneHot) { for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_zeros_uc(1U << i), i); + EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_zeros_uc( + static_cast(1U << i)), + i); } diff --git a/libc/test/src/stdbit/stdc_trailing_zeros_us_test.cpp b/libc/test/src/stdbit/stdc_trailing_zeros_us_test.cpp index a9f8327dfd914..e1171f24ccfda 100644 --- a/libc/test/src/stdbit/stdc_trailing_zeros_us_test.cpp +++ 
b/libc/test/src/stdbit/stdc_trailing_zeros_us_test.cpp @@ -17,5 +17,7 @@ TEST(LlvmLibcStdcTrailingZerosUsTest, Zero) { TEST(LlvmLibcStdcTrailingZerosUsTest, OneHot) { for (unsigned i = 0U; i != USHRT_WIDTH; ++i) - EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_zeros_us(1U << i), i); + EXPECT_EQ(LIBC_NAMESPACE::stdc_trailing_zeros_us( + static_cast(1U << i)), + i); } diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt index e4d4fc5b52558..8f0226bf41672 100644 --- a/libc/test/src/stdfix/CMakeLists.txt +++ b/libc/test/src/stdfix/CMakeLists.txt @@ -73,6 +73,20 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.__support.CPP.bit libc.src.__support.fixed_point.fx_bits ) + + add_libc_test( + countls${suffix}_test + SUITE + libc-stdfix-tests + HDRS + CountlsTest.h + SRCS + countls${suffix}_test.cpp + DEPENDS + libc.src.stdfix.countls${suffix} + libc.src.__support.fixed_point.fx_rep + libc.src.__support.fixed_point.fx_bits + ) endforeach() add_libc_test( diff --git a/libc/test/src/stdfix/CountlsTest.h b/libc/test/src/stdfix/CountlsTest.h new file mode 100644 index 0000000000000..a8201acb455b5 --- /dev/null +++ b/libc/test/src/stdfix/CountlsTest.h @@ -0,0 +1,58 @@ +//===-- Utility class to test countls -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "test/UnitTest/Test.h" + +#include "src/__support/fixed_point/fx_rep.h" + +template class CountlsTest : public LIBC_NAMESPACE::testing::Test { + + using FXRep = LIBC_NAMESPACE::fixed_point::FXRep; + static constexpr T zero = FXRep::ZERO(); + static constexpr T max = FXRep::MAX(); + static constexpr T min = FXRep::MIN(); + static constexpr T one_half = FXRep::ONE_HALF(); + static constexpr T one_fourth = FXRep::ONE_FOURTH(); + static constexpr T eps = FXRep::EPS(); + +public: + typedef int (*CountlsFunc)(T); + + void testSpecialNumbers(CountlsFunc func) { + constexpr bool is_signed = (FXRep::SIGN_LEN > 0); + + EXPECT_EQ(FXRep::INTEGRAL_LEN, func(one_half)); + EXPECT_EQ(FXRep::INTEGRAL_LEN + 1, func(one_fourth)); + EXPECT_EQ(FXRep::VALUE_LEN, func(zero)); + EXPECT_EQ(FXRep::VALUE_LEN - 1, func(eps)); + EXPECT_EQ(0, func(max)); + // If signed, left shifting the minimum value will overflow, so countls = 0. + // If unsigned, the minimum value is zero, so countls is the number of value + // bits according to ISO/IEC TR 18037. + EXPECT_EQ(is_signed ? 
0 : FXRep::VALUE_LEN, func(min)); + + if (10 <= static_cast(max)) + EXPECT_EQ(FXRep::INTEGRAL_LEN - 4, func(10)); + + if (static_cast(min) <= -10) + EXPECT_EQ(FXRep::INTEGRAL_LEN - 4, func(-10)); + + if constexpr (is_signed) { + EXPECT_EQ(FXRep::VALUE_LEN, func(-zero)); + EXPECT_EQ(FXRep::VALUE_LEN, func(-eps)); + EXPECT_EQ(FXRep::INTEGRAL_LEN + 1, func(-one_half)); + if (FXRep::FRACTION_LEN >= 2) + EXPECT_EQ(FXRep::INTEGRAL_LEN + 2, func(-one_fourth)); + } + } +}; + +#define LIST_COUNTLS_TESTS(T, func) \ + using LlvmLibcCountlsTest = CountlsTest; \ + TEST_F(LlvmLibcCountlsTest, SpecialNumbers) { testSpecialNumbers(&func); } \ + static_assert(true, "Require semicolon.") diff --git a/libc/test/src/stdfix/countlshk_test.cpp b/libc/test/src/stdfix/countlshk_test.cpp new file mode 100644 index 0000000000000..659f869706b5f --- /dev/null +++ b/libc/test/src/stdfix/countlshk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlshk -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlshk.h" + +LIST_COUNTLS_TESTS(short accum, LIBC_NAMESPACE::countlshk); diff --git a/libc/test/src/stdfix/countlshr_test.cpp b/libc/test/src/stdfix/countlshr_test.cpp new file mode 100644 index 0000000000000..361d4acab3b11 --- /dev/null +++ b/libc/test/src/stdfix/countlshr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlshr -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlshr.h" + +LIST_COUNTLS_TESTS(short fract, LIBC_NAMESPACE::countlshr); diff --git a/libc/test/src/stdfix/countlsk_test.cpp b/libc/test/src/stdfix/countlsk_test.cpp new file mode 100644 index 0000000000000..74cb519ec78de --- /dev/null +++ b/libc/test/src/stdfix/countlsk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsk --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsk.h" + +LIST_COUNTLS_TESTS(accum, LIBC_NAMESPACE::countlsk); diff --git a/libc/test/src/stdfix/countlslk_test.cpp b/libc/test/src/stdfix/countlslk_test.cpp new file mode 100644 index 0000000000000..006939db3c87e --- /dev/null +++ b/libc/test/src/stdfix/countlslk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlslk -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlslk.h" + +LIST_COUNTLS_TESTS(long accum, LIBC_NAMESPACE::countlslk); diff --git a/libc/test/src/stdfix/countlslr_test.cpp b/libc/test/src/stdfix/countlslr_test.cpp new file mode 100644 index 0000000000000..896cf9259c3ea --- /dev/null +++ b/libc/test/src/stdfix/countlslr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlslr -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlslr.h" + +LIST_COUNTLS_TESTS(long fract, LIBC_NAMESPACE::countlslr); diff --git a/libc/test/src/stdfix/countlsr_test.cpp b/libc/test/src/stdfix/countlsr_test.cpp new file mode 100644 index 0000000000000..d7ae91ccd6a92 --- /dev/null +++ b/libc/test/src/stdfix/countlsr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsr --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsr.h" + +LIST_COUNTLS_TESTS(fract, LIBC_NAMESPACE::countlsr); diff --git a/libc/test/src/stdfix/countlsuhk_test.cpp b/libc/test/src/stdfix/countlsuhk_test.cpp new file mode 100644 index 0000000000000..d8e68d65160e7 --- /dev/null +++ b/libc/test/src/stdfix/countlsuhk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsuhk ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsuhk.h" + +LIST_COUNTLS_TESTS(unsigned short accum, LIBC_NAMESPACE::countlsuhk); diff --git a/libc/test/src/stdfix/countlsuhr_test.cpp b/libc/test/src/stdfix/countlsuhr_test.cpp new file mode 100644 index 0000000000000..7dbc590d4a552 --- /dev/null +++ b/libc/test/src/stdfix/countlsuhr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsuhr ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsuhr.h" + +LIST_COUNTLS_TESTS(unsigned short fract, LIBC_NAMESPACE::countlsuhr); diff --git a/libc/test/src/stdfix/countlsuk_test.cpp b/libc/test/src/stdfix/countlsuk_test.cpp new file mode 100644 index 0000000000000..20f78d8c942b6 --- /dev/null +++ b/libc/test/src/stdfix/countlsuk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsuk -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsuk.h" + +LIST_COUNTLS_TESTS(unsigned accum, LIBC_NAMESPACE::countlsuk); diff --git a/libc/test/src/stdfix/countlsulk_test.cpp b/libc/test/src/stdfix/countlsulk_test.cpp new file mode 100644 index 0000000000000..81ae208055cd9 --- /dev/null +++ b/libc/test/src/stdfix/countlsulk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsulk ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsulk.h" + +LIST_COUNTLS_TESTS(unsigned long accum, LIBC_NAMESPACE::countlsulk); diff --git a/libc/test/src/stdfix/countlsulr_test.cpp b/libc/test/src/stdfix/countlsulr_test.cpp new file mode 100644 index 0000000000000..5b9b047f7fd74 --- /dev/null +++ b/libc/test/src/stdfix/countlsulr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsulr ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsulr.h" + +LIST_COUNTLS_TESTS(unsigned long fract, LIBC_NAMESPACE::countlsulr); diff --git a/libc/test/src/stdfix/countlsur_test.cpp b/libc/test/src/stdfix/countlsur_test.cpp new file mode 100644 index 0000000000000..67e32d7b56217 --- /dev/null +++ b/libc/test/src/stdfix/countlsur_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsur -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsur.h" + +LIST_COUNTLS_TESTS(unsigned fract, LIBC_NAMESPACE::countlsur); diff --git a/libc/utils/docgen/docgen.py b/libc/utils/docgen/docgen.py index 09db284ef9282..5a57987b3c51e 100755 --- a/libc/utils/docgen/docgen.py +++ b/libc/utils/docgen/docgen.py @@ -57,6 +57,7 @@ def check_api(header: Header, api: Dict): "c-definition", "in-latest-posix", "removed-in-posix-2008", + "removed-in-posix-2024", ] # Validate macros @@ -140,6 +141,10 @@ def print_functions_rst(header: Header, functions: Dict): print( f" - `removed in POSIX.1-2008 `__" ) + elif "removed-in-posix-2024" in functions[name]: + print( + f" - `removed in POSIX.1-2024 `__" + ) else: print(" -") diff --git a/libc/utils/docgen/sys/time.yaml b/libc/utils/docgen/sys/time.yaml index 1e3909adafea3..b7678f403cdcf 100644 --- a/libc/utils/docgen/sys/time.yaml +++ b/libc/utils/docgen/sys/time.yaml @@ -3,3 +3,9 @@ functions: in-latest-posix: '' utimes: in-latest-posix: '' + getitimer: + removed-in-posix-2024: '' + gettimeofday: + removed-in-posix-2024: '' + setitimer: + removed-in-posix-2024: '' diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 2978fadc2c29f..c88ea9700d100 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -247,11 +247,27 @@ add_custom_target( "generate_convert.cl" DEPENDS convert.cl ) set_target_properties( "generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) add_custom_command( - OUTPUT clspv-convert.cl - COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl + OUTPUT clc-convert.cl + COMMAND ${Python3_EXECUTABLE} ${script_loc} --clc > clc-convert.cl DEPENDS ${script_loc} ) -add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl ) -set_target_properties( "clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) +add_custom_target( "clc-generate_convert.cl" DEPENDS clc-convert.cl ) +set_target_properties( "clc-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) + +if ( clspv-- IN_LIST LIBCLC_TARGETS_TO_BUILD OR clspv64-- IN_LIST LIBCLC_TARGETS_TO_BUILD ) + add_custom_command( + OUTPUT clspv-convert.cl + COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl + DEPENDS ${script_loc} ) + add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl ) + set_target_properties( "clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) + + add_custom_command( + OUTPUT clc-clspv-convert.cl + COMMAND ${Python3_EXECUTABLE} ${script_loc} --clc --clspv > clc-clspv-convert.cl + DEPENDS ${script_loc} ) + add_custom_target( "clc-clspv-generate_convert.cl" DEPENDS clc-clspv-convert.cl ) + set_target_properties( "clc-clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) +endif() enable_testing() @@ -289,6 +305,12 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( clc_lib_files ) set( clc_dirs ${dirs} generic ) + if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) + set( clc_gen_files clc-clspv-convert.cl ) + else() + set( clc_gen_files clc-convert.cl ) + endif() + libclc_configure_lib_source( clc_lib_files CLC_INTERNAL @@ -372,6 +394,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) COMPILE_FLAGS ${clc_build_flags} OPT_FLAGS ${opt_flags} LIB_FILES ${clc_lib_files} + GEN_FILES ${clc_gen_files} ) list( APPEND build_flags diff --git a/libclc/clc/include/clc/clc_convert.h 
b/libclc/clc/include/clc/clc_convert.h new file mode 100644 index 0000000000000..20bbd57540b30 --- /dev/null +++ b/libclc/clc/include/clc/clc_convert.h @@ -0,0 +1,98 @@ +#ifndef __CLC_CLC_CONVERT_H__ +#define __CLC_CLC_CONVERT_H__ + +#define _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DECL TO_TYPE __clc_convert_##TO_TYPE##SUFFIX(FROM_TYPE x); + +#define _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##2, TO_TYPE##2, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##3, TO_TYPE##3, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##4, TO_TYPE##4, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##8, TO_TYPE##8, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##16, TO_TYPE##16, SUFFIX) + +#define _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, char, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uchar, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, int, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uint, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, short, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ushort, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, long, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, float, SUFFIX) + +#if defined(cl_khr_fp64) && defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX) +#elif defined(cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX) +#elif defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX) +#else +#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) +#endif + +#define _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(char, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(uchar, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(int, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(uint, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(short, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(ushort, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(long, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(float, SUFFIX) + +#if defined(cl_khr_fp64) && defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ + _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(double, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(half, SUFFIX) +#elif defined(cl_khr_fp64) +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ + _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(double, SUFFIX) +#elif defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ + _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(half, SUFFIX) +#else +#define _CLC_VECTOR_CONVERT_TO(SUFFIX) _CLC_VECTOR_CONVERT_TO1(SUFFIX) +#endif + +#define _CLC_VECTOR_CONVERT_TO_SUFFIX(ROUND) \ + 
_CLC_VECTOR_CONVERT_TO(_sat##ROUND) \ + _CLC_VECTOR_CONVERT_TO(ROUND) + +_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtn) +_CLC_VECTOR_CONVERT_TO_SUFFIX(_rte) +_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtz) +_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtp) +_CLC_VECTOR_CONVERT_TO_SUFFIX() + +#undef _CLC_VECTOR_CONVERT_TO_SUFFIX +#undef _CLC_VECTOR_CONVERT_TO +#undef _CLC_VECTOR_CONVERT_TO1 +#undef _CLC_VECTOR_CONVERT_FROM +#undef _CLC_VECTOR_CONVERT_FROM1 +#undef _CLC_VECTOR_CONVERT_DECL +#undef _CLC_CONVERT_DECL + +#endif // __CLC_CLC_CONVERT_H__ diff --git a/libclc/clc/include/clc/float/definitions.h b/libclc/clc/include/clc/float/definitions.h new file mode 100644 index 0000000000000..6fea19a1cae98 --- /dev/null +++ b/libclc/clc/include/clc/float/definitions.h @@ -0,0 +1,88 @@ +#define MAXFLOAT 0x1.fffffep127f +#define HUGE_VALF __builtin_huge_valf() +#define INFINITY __builtin_inff() +#define NAN __builtin_nanf("") + +#define FLT_DIG 6 +#define FLT_MANT_DIG 24 +#define FLT_MAX_10_EXP +38 +#define FLT_MAX_EXP +128 +#define FLT_MIN_10_EXP -37 +#define FLT_MIN_EXP -125 +#define FLT_RADIX 2 +#define FLT_MAX MAXFLOAT +#define FLT_MIN 0x1.0p-126f +#define FLT_EPSILON 0x1.0p-23f + +#define FP_ILOGB0 (-2147483647 - 1) +#define FP_ILOGBNAN 2147483647 + +#define M_E_F 0x1.5bf0a8p+1f +#define M_LOG2E_F 0x1.715476p+0f +#define M_LOG10E_F 0x1.bcb7b2p-2f +#define M_LN2_F 0x1.62e430p-1f +#define M_LN10_F 0x1.26bb1cp+1f +#define M_PI_F 0x1.921fb6p+1f +#define M_PI_2_F 0x1.921fb6p+0f +#define M_PI_4_F 0x1.921fb6p-1f +#define M_1_PI_F 0x1.45f306p-2f +#define M_2_PI_F 0x1.45f306p-1f +#define M_2_SQRTPI_F 0x1.20dd76p+0f +#define M_SQRT2_F 0x1.6a09e6p+0f +#define M_SQRT1_2_F 0x1.6a09e6p-1f + +#define M_LOG210_F 0x1.a934f0p+1f + +#ifdef cl_khr_fp64 + +#define HUGE_VAL __builtin_huge_val() + +#define DBL_DIG 15 +#define DBL_MANT_DIG 53 +#define DBL_MAX_10_EXP +308 +#define DBL_MAX_EXP +1024 +#define DBL_MIN_10_EXP -307 +#define DBL_MIN_EXP -1021 +#define DBL_MAX 0x1.fffffffffffffp1023 +#define DBL_MIN 0x1.0p-1022 +#define DBL_EPSILON 0x1.0p-52 + +#define M_E 0x1.5bf0a8b145769p+1 +#define M_LOG2E 0x1.71547652b82fep+0 +#define M_LOG10E 0x1.bcb7b1526e50ep-2 +#define M_LN2 0x1.62e42fefa39efp-1 +#define M_LN10 0x1.26bb1bbb55516p+1 +#define M_PI 0x1.921fb54442d18p+1 +#define M_PI_2 0x1.921fb54442d18p+0 +#define M_PI_4 0x1.921fb54442d18p-1 +#define M_1_PI 0x1.45f306dc9c883p-2 +#define M_2_PI 0x1.45f306dc9c883p-1 +#define M_2_SQRTPI 0x1.20dd750429b6dp+0 +#define M_SQRT2 0x1.6a09e667f3bcdp+0 +#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 + +#ifdef __CLC_INTERNAL +#define M_LOG210 0x1.a934f0979a371p+1 +#endif + +#endif + +#ifdef cl_khr_fp16 + +#if __OPENCL_VERSION__ >= 120 + +#define HALF_DIG 3 +#define HALF_MANT_DIG 11 +#define HALF_MAX_10_EXP +4 +#define HALF_MAX_EXP +16 +#define HALF_MIN_10_EXP -4 +#define HALF_MIN_EXP -13 + +#define HALF_RADIX 2 +#define HALF_MAX 0x1.ffcp15h +#define HALF_MIN 0x1.0p-14h +#define HALF_EPSILON 0x1.0p-10h + +#endif + +#endif diff --git a/libclc/generic/include/clc/convert.h b/libclc/generic/include/clc/convert.h index 8219df47ad2c6..4e96c874b5cd8 100644 --- a/libclc/generic/include/clc/convert.h +++ b/libclc/generic/include/clc/convert.h @@ -1,74 +1,81 @@ -#define _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ +#define _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ _CLC_OVERLOAD _CLC_DECL TO_TYPE convert_##TO_TYPE##SUFFIX(FROM_TYPE x); -#define _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ - _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ - _CLC_CONVERT_DECL(FROM_TYPE##2, TO_TYPE##2, SUFFIX) \ - 
_CLC_CONVERT_DECL(FROM_TYPE##3, TO_TYPE##3, SUFFIX) \ - _CLC_CONVERT_DECL(FROM_TYPE##4, TO_TYPE##4, SUFFIX) \ - _CLC_CONVERT_DECL(FROM_TYPE##8, TO_TYPE##8, SUFFIX) \ +#define _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##2, TO_TYPE##2, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##3, TO_TYPE##3, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##4, TO_TYPE##4, SUFFIX) \ + _CLC_CONVERT_DECL(FROM_TYPE##8, TO_TYPE##8, SUFFIX) \ _CLC_CONVERT_DECL(FROM_TYPE##16, TO_TYPE##16, SUFFIX) -#define _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, char, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uchar, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, int, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uint, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, short, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ushort, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, long, SUFFIX) \ - _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \ +#define _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, char, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uchar, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, int, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uint, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, short, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ushort, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, long, SUFFIX) \ + _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, float, SUFFIX) #if defined(cl_khr_fp64) && defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp64 : enable #define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX) \ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX) #elif defined(cl_khr_fp64) -#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX) #elif defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, half, SUFFIX) #else -#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ +#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \ _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) #endif -#define _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(char, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(uchar, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(int, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(uint, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(short, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(ushort, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(long, SUFFIX) \ - _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \ +#define _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(char, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(uchar, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(int, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(uint, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(short, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(ushort, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(long, SUFFIX) \ + _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \ _CLC_VECTOR_CONVERT_FROM(float, SUFFIX) #if defined(cl_khr_fp64) && defined(cl_khr_fp16) +#pragma OPENCL 
EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_fp64 : enable #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ _CLC_VECTOR_CONVERT_FROM(double, SUFFIX) \ _CLC_VECTOR_CONVERT_FROM(half, SUFFIX) #elif defined(cl_khr_fp64) -#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ - _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ + _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ _CLC_VECTOR_CONVERT_FROM(double, SUFFIX) #elif defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ _CLC_VECTOR_CONVERT_FROM(half, SUFFIX) #else -#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ - _CLC_VECTOR_CONVERT_TO1(SUFFIX) +#define _CLC_VECTOR_CONVERT_TO(SUFFIX) _CLC_VECTOR_CONVERT_TO1(SUFFIX) #endif -#define _CLC_VECTOR_CONVERT_TO_SUFFIX(ROUND) \ - _CLC_VECTOR_CONVERT_TO(_sat##ROUND) \ +#define _CLC_VECTOR_CONVERT_TO_SUFFIX(ROUND) \ + _CLC_VECTOR_CONVERT_TO(_sat##ROUND) \ _CLC_VECTOR_CONVERT_TO(ROUND) _CLC_VECTOR_CONVERT_TO_SUFFIX(_rtn) @@ -76,3 +83,11 @@ _CLC_VECTOR_CONVERT_TO_SUFFIX(_rte) _CLC_VECTOR_CONVERT_TO_SUFFIX(_rtz) _CLC_VECTOR_CONVERT_TO_SUFFIX(_rtp) _CLC_VECTOR_CONVERT_TO_SUFFIX() + +#undef _CLC_VECTOR_CONVERT_TO_SUFFIX +#undef _CLC_VECTOR_CONVERT_TO +#undef _CLC_VECTOR_CONVERT_TO1 +#undef _CLC_VECTOR_CONVERT_FROM +#undef _CLC_VECTOR_CONVERT_FROM1 +#undef _CLC_VECTOR_CONVERT_DECL +#undef _CLC_CONVERT_DECL diff --git a/libclc/generic/include/clc/float/definitions.h b/libclc/generic/include/clc/float/definitions.h deleted file mode 100644 index be3d0130f3e61..0000000000000 --- a/libclc/generic/include/clc/float/definitions.h +++ /dev/null @@ -1,88 +0,0 @@ -#define MAXFLOAT 0x1.fffffep127f -#define HUGE_VALF __builtin_huge_valf() -#define INFINITY __builtin_inff() -#define NAN __builtin_nanf("") - -#define FLT_DIG 6 -#define FLT_MANT_DIG 24 -#define FLT_MAX_10_EXP +38 -#define FLT_MAX_EXP +128 -#define FLT_MIN_10_EXP -37 -#define FLT_MIN_EXP -125 -#define FLT_RADIX 2 -#define FLT_MAX MAXFLOAT -#define FLT_MIN 0x1.0p-126f -#define FLT_EPSILON 0x1.0p-23f - -#define FP_ILOGB0 (-2147483647 - 1) -#define FP_ILOGBNAN 2147483647 - -#define M_E_F 0x1.5bf0a8p+1f -#define M_LOG2E_F 0x1.715476p+0f -#define M_LOG10E_F 0x1.bcb7b2p-2f -#define M_LN2_F 0x1.62e430p-1f -#define M_LN10_F 0x1.26bb1cp+1f -#define M_PI_F 0x1.921fb6p+1f -#define M_PI_2_F 0x1.921fb6p+0f -#define M_PI_4_F 0x1.921fb6p-1f -#define M_1_PI_F 0x1.45f306p-2f -#define M_2_PI_F 0x1.45f306p-1f -#define M_2_SQRTPI_F 0x1.20dd76p+0f -#define M_SQRT2_F 0x1.6a09e6p+0f -#define M_SQRT1_2_F 0x1.6a09e6p-1f - -#define M_LOG210_F 0x1.a934f0p+1f - -#ifdef cl_khr_fp64 - -#define HUGE_VAL __builtin_huge_val() - -#define DBL_DIG 15 -#define DBL_MANT_DIG 53 -#define DBL_MAX_10_EXP +308 -#define DBL_MAX_EXP +1024 -#define DBL_MIN_10_EXP -307 -#define DBL_MIN_EXP -1021 -#define DBL_MAX 0x1.fffffffffffffp1023 -#define DBL_MIN 0x1.0p-1022 -#define DBL_EPSILON 0x1.0p-52 - -#define M_E 0x1.5bf0a8b145769p+1 -#define M_LOG2E 0x1.71547652b82fep+0 -#define M_LOG10E 0x1.bcb7b1526e50ep-2 -#define M_LN2 0x1.62e42fefa39efp-1 -#define M_LN10 0x1.26bb1bbb55516p+1 -#define M_PI 0x1.921fb54442d18p+1 -#define M_PI_2 0x1.921fb54442d18p+0 -#define M_PI_4 0x1.921fb54442d18p-1 -#define M_1_PI 0x1.45f306dc9c883p-2 -#define M_2_PI 0x1.45f306dc9c883p-1 -#define M_2_SQRTPI 0x1.20dd750429b6dp+0 -#define M_SQRT2 0x1.6a09e667f3bcdp+0 -#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 - -#ifdef __CLC_INTERNAL -#define M_LOG210 
0x1.a934f0979a371p+1 -#endif - -#endif - -#ifdef cl_khr_fp16 - -#if __OPENCL_VERSION__ >= 120 - -#define HALF_DIG 3 -#define HALF_MANT_DIG 11 -#define HALF_MAX_10_EXP +4 -#define HALF_MAX_EXP +16 -#define HALF_MIN_10_EXP -4 -#define HALF_MIN_EXP -13 - -#define HALF_RADIX 2 -#define HALF_MAX 0x1.ffcp15h -#define HALF_MIN 0x1.0p-14h -#define HALF_EPSILON 0x1.0p-10h - -#endif - -#endif diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py index d2f69e602a29d..92f32694a52af 100644 --- a/libclc/generic/lib/gen_convert.py +++ b/libclc/generic/lib/gen_convert.py @@ -30,11 +30,15 @@ import argparse parser = argparse.ArgumentParser() +parser.add_argument( + "--clc", action="store_true", help="Generate clc internal conversions" +) parser.add_argument( "--clspv", action="store_true", help="Generate the clspv variant of the code" ) args = parser.parse_args() +clc = args.clc clspv = args.clspv types = [ @@ -158,8 +162,32 @@ def conditional_guard(src, dst): return False +nl = "\n" +includes = [] +if not clc: + includes = [""] +else: + includes = sorted( + [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + ) + print( - """/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!! + f"""/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!! DON'T CHANGE THIS FILE. MAKE YOUR CHANGES TO convert_type.py AND RUN: $ ./generate-conversion-type-cl.sh @@ -188,7 +216,8 @@ def conditional_guard(src, dst): THE SOFTWARE. */ -#include +{nl.join(['#include ' + f for f in includes])} +#include #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable @@ -210,6 +239,7 @@ def conditional_guard(src, dst): """ ) + # # Default Conversions # @@ -236,6 +266,13 @@ def conditional_guard(src, dst): # even for integer-to-integer conversions. When such a conversion # is used, the rounding mode is ignored. 
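(Aside, not part of the patch: to make the split concrete, here is roughly what the generator emits for one conversion under the new scheme, reconstructed from the format strings used below. With --clc the implementation lands in the CLC layer as __clc_convert_*, and the user-facing entry point becomes a thin wrapper, e.g. for int4 -> float4:)

_CLC_DEF _CLC_OVERLOAD float4 __clc_convert_float4(int4 x) {
  return __builtin_convertvector(x, float4);
}

_CLC_DEF _CLC_OVERLOAD float4 convert_float4(int4 x) {
  return __clc_convert_float4(x);
}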
# +def print_passthru_conversion(src_ty, dst_ty, fn_name): + print( + f"""_CLC_DEF _CLC_OVERLOAD {dst_ty} {fn_name}({src_ty} x) {{ + return __clc_{fn_name}(x); +}} +""" + ) def generate_default_conversion(src, dst, mode): @@ -243,26 +280,35 @@ def generate_default_conversion(src, dst, mode): for size in vector_sizes: if not size: - print( - f"""_CLC_DEF _CLC_OVERLOAD {dst} convert_{dst}{mode}({src} x) {{ + if clc: + print( + f"""_CLC_DEF _CLC_OVERLOAD {dst} __clc_convert_{dst}{mode}({src} x) {{ return ({dst})x; }} """ - ) + ) + else: + print_passthru_conversion(src, dst, f"convert_{dst}{mode}") else: - print( - f"""_CLC_DEF _CLC_OVERLOAD {dst}{size} convert_{dst}{size}{mode}({src}{size} x) {{ + if clc: + print( + f"""_CLC_DEF _CLC_OVERLOAD {dst}{size} __clc_convert_{dst}{size}{mode}({src}{size} x) {{ return __builtin_convertvector(x, {dst}{size}); }} """ - ) + ) + else: + print_passthru_conversion( + f"{src}{size}", f"{dst}{size}", f"convert_{dst}{size}{mode}" + ) if close_conditional: print("#endif") -# Do not generate default conversion for clspv as they are handled natively -if not clspv: +# Do not generate user-facing default conversions for clspv as they are handled +# natively +if clc or not clspv: for src in types: for dst in types: generate_default_conversion(src, dst, "") @@ -270,15 +316,16 @@ def generate_default_conversion(src, dst, mode): for src in int_types: for dst in int_types: for mode in rounding_modes: - # Do not generate "_rte" conversion for clspv as they are handled - # natively - if clspv and mode == "_rte": + # Do not generate user-facing "_rte" conversions for clspv as they + # are handled natively + if clspv and not clc and mode == "_rte": continue generate_default_conversion(src, dst, mode) # # Saturated Conversions To Integers -# + + # These functions are dependent on the unsaturated conversion functions # generated above, and use clamp, max, min, and select to eliminate # branching and vectorize the conversions. @@ -286,35 +333,37 @@ def generate_default_conversion(src, dst, mode): # Again, as above, we allow all rounding modes for integer-to-integer # conversions with saturation. # - - def generate_saturated_conversion(src, dst, size): # Header close_conditional = conditional_guard(src, dst) - print( - """_CLC_DEF _CLC_OVERLOAD -{DST}{N} convert_{DST}{N}_sat({SRC}{N} x) -{{""".format( - DST=dst, SRC=src, N=size - ) - ) - # FIXME: This is a work around for lack of select function with - # signed third argument when the first two arguments are unsigned types. - # We cast to the signed type for sign-extension, then do a bitcast to - # the unsigned type. + dstn = f"{dst}{size}" + srcn = f"{src}{size}" + + if not clc: + print_passthru_conversion(f"{srcn}", f"{dstn}", f"convert_{dstn}_sat") + if close_conditional: + print("#endif") + return + + print(f"_CLC_DEF _CLC_OVERLOAD {dstn} __clc_convert_{dstn}_sat({srcn} x) {{") + + # FIXME: This is a work around for lack of select function with signed + # third argument when the first two arguments are unsigned types. We cast + # to the signed type for sign-extension, then do a bitcast to the unsigned + # type. 
if dst in unsigned_types: - bool_prefix = "as_{DST}{N}(convert_{BOOL}{N}".format( - DST=dst, BOOL=bool_type[dst], N=size - ) + bool_prefix = f"__clc_as_{dstn}(__clc_convert_{bool_type[dst]}{size}" bool_suffix = ")" else: - bool_prefix = "convert_{BOOL}{N}".format(BOOL=bool_type[dst], N=size) + bool_prefix = f"__clc_convert_{bool_type[dst]}{size}" bool_suffix = "" + dst_max = limit_max[dst] + dst_min = limit_min[dst] + # Body if src == dst: - # Conversion between same types print(" return x;") @@ -323,69 +372,40 @@ def generate_saturated_conversion(src, dst, size): if clspv: # Conversion from float to int print( - """ {DST}{N} y = convert_{DST}{N}(x); - y = select(y, ({DST}{N}){DST_MIN}, {BP}(x <= ({SRC}{N}){DST_MIN}){BS}); - y = select(y, ({DST}{N}){DST_MAX}, {BP}(x >= ({SRC}{N}){DST_MAX}){BS}); - return y;""".format( - SRC=src, - DST=dst, - N=size, - DST_MIN=limit_min[dst], - DST_MAX=limit_max[dst], - BP=bool_prefix, - BS=bool_suffix, - ) + f""" {dstn} y = __clc_convert_{dstn}(x); + y = __clc_select(y, ({dstn}){dst_min}, {bool_prefix}(x <= ({srcn}){dst_min}){bool_suffix}); + y = __clc_select(y, ({dstn}){dst_max}, {bool_prefix}(x >= ({srcn}){dst_max}){bool_suffix}); + return y;""" ) else: # Conversion from float to int print( - """ {DST}{N} y = convert_{DST}{N}(x); - y = select(y, ({DST}{N}){DST_MIN}, {BP}(x < ({SRC}{N}){DST_MIN}){BS}); - y = select(y, ({DST}{N}){DST_MAX}, {BP}(x > ({SRC}{N}){DST_MAX}){BS}); - return y;""".format( - SRC=src, - DST=dst, - N=size, - DST_MIN=limit_min[dst], - DST_MAX=limit_max[dst], - BP=bool_prefix, - BS=bool_suffix, - ) + f""" {dstn} y = __clc_convert_{dstn}(x); + y = __clc_select(y, ({dstn}){dst_min}, {bool_prefix}(x < ({srcn}){dst_min}){bool_suffix}); + y = __clc_select(y, ({dstn}){dst_max}, {bool_prefix}(x > ({srcn}){dst_max}){bool_suffix}); + return y;""" ) - else: # Integer to integer convesion with sizeof(src) == sizeof(dst) if sizeof_type[src] == sizeof_type[dst]: if src in unsigned_types: - print( - " x = min(x, ({SRC}){DST_MAX});".format( - SRC=src, DST_MAX=limit_max[dst] - ) - ) + print(f" x = __clc_min(x, ({src}){dst_max});") else: - print(" x = max(x, ({SRC})0);".format(SRC=src)) + print(f" x = __clc_max(x, ({src})0);") # Integer to integer conversion where sizeof(src) > sizeof(dst) elif sizeof_type[src] > sizeof_type[dst]: if src in unsigned_types: - print( - " x = min(x, ({SRC}){DST_MAX});".format( - SRC=src, DST_MAX=limit_max[dst] - ) - ) + print(f" x = __clc_min(x, ({src}){dst_max});") else: - print( - " x = clamp(x, ({SRC}){DST_MIN}, ({SRC}){DST_MAX});".format( - SRC=src, DST_MIN=limit_min[dst], DST_MAX=limit_max[dst] - ) - ) + print(f" x = __clc_clamp(x, ({src}){dst_min}, ({src}){dst_max});") # Integer to integer conversion where sizeof(src) < sizeof(dst) elif src not in unsigned_types and dst in unsigned_types: - print(" x = max(x, ({SRC})0);".format(SRC=src)) + print(f" x = __clc_max(x, ({src})0);") - print(" return convert_{DST}{N}(x);".format(DST=dst, N=size)) + print(f" return __clc_convert_{dstn}(x);") # Footer print("}") @@ -403,17 +423,19 @@ def generate_saturated_conversion_with_rounding(src, dst, size, mode): # Header close_conditional = conditional_guard(src, dst) - # Body - print( - """_CLC_DEF _CLC_OVERLOAD -{DST}{N} convert_{DST}{N}_sat{M}({SRC}{N} x) -{{ - return convert_{DST}{N}_sat(x); + dstn = f"{dst}{size}" + srcn = f"{src}{size}" + + if not clc: + print_passthru_conversion(f"{srcn}", f"{dstn}", f"convert_{dstn}_sat{mode}") + else: + # Body + print( + f"""_CLC_DEF _CLC_OVERLOAD {dstn} 
__clc_convert_{dstn}_sat{mode}({srcn} x) {{ + return __clc_convert_{dstn}_sat(x); }} -""".format( - DST=dst, SRC=src, N=size, M=mode +""" ) - ) # Footer if close_conditional: @@ -426,6 +448,7 @@ def generate_saturated_conversion_with_rounding(src, dst, size, mode): for mode in rounding_modes: generate_saturated_conversion_with_rounding(src, dst, size, mode) + # # Conversions To/From Floating-Point With Rounding # @@ -439,134 +462,90 @@ def generate_saturated_conversion_with_rounding(src, dst, size, mode): # # Only conversions to integers can have saturation. # - - def generate_float_conversion(src, dst, size, mode, sat): # Header close_conditional = conditional_guard(src, dst) - print( - """_CLC_DEF _CLC_OVERLOAD -{DST}{N} convert_{DST}{N}{S}{M}({SRC}{N} x) -{{""".format( - SRC=src, DST=dst, N=size, M=mode, S=sat - ) - ) + + dstn = f"{dst}{size}" + srcn = f"{src}{size}" + booln = f"{bool_type[dst]}{size}" + src_max = limit_max[src] if src in limit_max else "" + dst_min = limit_min[dst] if dst in limit_min else "" + + if not clc: + print_passthru_conversion(f"{srcn}", f"{dstn}", f"convert_{dstn}{sat}{mode}") + # Footer + if close_conditional: + print("#endif") + return + + print(f"_CLC_DEF _CLC_OVERLOAD {dstn} __clc_convert_{dstn}{sat}{mode}({srcn} x) {{") # Perform conversion if dst in int_types: if mode == "_rte": - print(" x = rint(x);") + print(" x = __clc_rint(x);") elif mode == "_rtp": - print(" x = ceil(x);") + print(" x = __clc_ceil(x);") elif mode == "_rtn": - print(" x = floor(x);") - print(" return convert_{DST}{N}{S}(x);".format(DST=dst, N=size, S=sat)) + print(" x = __clc_floor(x);") + print(f" return __clc_convert_{dstn}{sat}(x);") elif mode == "_rte": - print(" return convert_{DST}{N}(x);".format(DST=dst, N=size)) + print(f" return __clc_convert_{dstn}(x);") else: - print(" {DST}{N} r = convert_{DST}{N}(x);".format(DST=dst, N=size)) + print(f" {dstn} r = __clc_convert_{dstn}(x);") if clspv: - print(" {SRC}{N} y = convert_{SRC}{N}_sat(r);".format(SRC=src, N=size)) + print(f" {srcn} y = __clc_convert_{srcn}_sat(r);") else: - print(" {SRC}{N} y = convert_{SRC}{N}(r);".format(SRC=src, N=size)) + print(f" {srcn} y = __clc_convert_{srcn}(r);") if mode == "_rtz": if src in int_types: - print( - " {USRC}{N} abs_x = abs(x);".format( - USRC=unsigned_type[src], N=size - ) - ) - print( - " {USRC}{N} abs_y = abs(y);".format( - USRC=unsigned_type[src], N=size - ) - ) - else: - print(" {SRC}{N} abs_x = fabs(x);".format(SRC=src, N=size)) - print(" {SRC}{N} abs_y = fabs(y);".format(SRC=src, N=size)) - if clspv: - print( - " {BOOL}{N} c = convert_{BOOL}{N}(abs_y > abs_x);".format( - BOOL=bool_type[dst], N=size - ) - ) - if sizeof_type[src] >= 4 and src in int_types: - print( - " c = c || convert_{BOOL}{N}(({SRC}{N}){SRC_MAX} == x);".format( - BOOL=bool_type[dst], N=size, SRC=src, SRC_MAX=limit_max[src] - ) - ) - print( - " {DST}{N} sel = select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), c);".format( - DST=dst, N=size, BOOL=bool_type[dst], SRC=src - ) - ) + usrcn = f"{unsigned_type[src]}{size}" + print(f" {usrcn} abs_x = __clc_abs(x);") + print(f" {usrcn} abs_y = __clc_abs(y);") else: - print( - " {DST}{N} sel = select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));".format( - DST=dst, N=size, BOOL=bool_type[dst] - ) - ) + print(f" {srcn} abs_x = __clc_fabs(x);") + print(f" {srcn} abs_y = __clc_fabs(y);") + print(f" {booln} c = __clc_convert_{booln}(abs_y > abs_x);") + if clspv and sizeof_type[src] >= 4 and src in int_types: + print(f" c = c || 
__clc_convert_{booln}(({srcn}){src_max} == x);") + print( + f" {dstn} sel = __clc_select(r, __clc_nextafter(r, __clc_sign(r) * ({dstn})-INFINITY), c);" + ) if dst == "half" and src in int_types and sizeof_type[src] >= 2: dst_max = limit_max[dst] - # short is 16 bits signed, so the maximum value rounded to zero is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff == 32767) + # short is 16 bits signed, so the maximum value rounded to zero + # is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff == 32767) if src == "short": dst_max = "0x1.ffcp+14" print( - " return clamp(sel, ({DST}{N}){DST_MIN}, ({DST}{N}){DST_MAX});".format( - DST=dst, N=size, DST_MIN=limit_min[dst], DST_MAX=dst_max - ) + f" return __clc_clamp(sel, ({dstn}){dst_min}, ({dstn}){dst_max});" ) else: print(" return sel;") if mode == "_rtp": print( - " {DST}{N} sel = select(r, nextafter(r, ({DST}{N})INFINITY), convert_{BOOL}{N}(y < x));".format( - DST=dst, N=size, BOOL=bool_type[dst] - ) + f" {dstn} sel = __clc_select(r, __clc_nextafter(r, ({dstn})INFINITY), __clc_convert_{booln}(y < x));" ) if dst == "half" and src in int_types and sizeof_type[src] >= 2: - print( - " return max(sel, ({DST}{N}){DST_MIN});".format( - DST=dst, N=size, DST_MIN=limit_min[dst] - ) - ) + print(f" return __clc_max(sel, ({dstn}){dst_min});") else: print(" return sel;") if mode == "_rtn": - if clspv: - print( - " {BOOL}{N} c = convert_{BOOL}{N}(y > x);".format( - BOOL=bool_type[dst], N=size - ) - ) - if sizeof_type[src] >= 4 and src in int_types: - print( - " c = c || convert_{BOOL}{N}(({SRC}{N}){SRC_MAX} == x);".format( - BOOL=bool_type[dst], N=size, SRC=src, SRC_MAX=limit_max[src] - ) - ) - print( - " {DST}{N} sel = select(r, nextafter(r, ({DST}{N})-INFINITY), c);".format( - DST=dst, N=size, BOOL=bool_type[dst], SRC=src - ) - ) - else: - print( - " {DST}{N} sel = select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));".format( - DST=dst, N=size, BOOL=bool_type[dst] - ) - ) + print(f" {booln} c = __clc_convert_{booln}(y > x);") + if clspv and sizeof_type[src] >= 4 and src in int_types: + print(f" c = c || __clc_convert_{booln}(({srcn}){src_max} == x);") + print( + f" {dstn} sel = __clc_select(r, __clc_nextafter(r, ({dstn})-INFINITY), c);" + ) if dst == "half" and src in int_types and sizeof_type[src] >= 2: dst_max = limit_max[dst] - # short is 16 bits signed, so the maximum value rounded to negative infinity is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff == 32767) + # short is 16 bits signed, so the maximum value rounded to + # negative infinity is 0x1.ffcp+14 (0x1p+15 == 32768 > 0x7fff + # == 32767) if src == "short": dst_max = "0x1.ffcp+14" - print( - " return min(sel, ({DST}{N}){DST_MAX});".format( - DST=dst, N=size, DST_MAX=dst_max - ) - ) + print(f" return __clc_min(sel, ({dstn}){dst_max});") else: print(" return sel;") @@ -588,8 +567,8 @@ def generate_float_conversion(src, dst, size, mode, sat): for dst in float_types: for size in vector_sizes: for mode in rounding_modes: - # Do not generate "_rte" conversion for clspv as they are - # handled natively - if clspv and mode == "_rte": + # Do not generate user-facing "_rte" conversions for clspv as + # they are handled natively + if clspv and not clc and mode == "_rte": continue generate_float_conversion(src, dst, size, mode, "") diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst index d399b94d27c19..097ad4b73b126 100644 --- a/libcxx/docs/Hardening.rst +++ b/libcxx/docs/Hardening.rst @@ -248,14 +248,12 @@ Hardening assertion failure =========================== In production modes (``fast`` and 
``extensive``), a hardening assertion failure -immediately ``_traps `` +immediately ``_traps `` the program. This is the safest approach that also minimizes the code size penalty as the failure handler maps to a single instruction. The downside is that the failure provides no additional details other than the stack trace (which might also be affected by optimizations). -TODO(hardening): describe ``__builtin_verbose_trap`` once we can use it. - In the ``debug`` mode, an assertion failure terminates the program in an unspecified manner and also outputs the associated error message to the error output. This is less secure and increases the size of the binary (among other diff --git a/libcxx/include/__config b/libcxx/include/__config index 1c6dd8f36c32f..ca6aade34107b 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1119,18 +1119,23 @@ typedef __char32_t char32_t; // Optional attributes - these are useful for a better QoI, but not required to be available +# define _LIBCPP_NOALIAS __attribute__((__malloc__)) +# define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] +# define _LIBCPP_NO_SANITIZE(...) __attribute__((__no_sanitize__(__VA_ARGS__))) +# define _LIBCPP_INIT_PRIORITY_MAX __attribute__((__init_priority__(100))) +# define _LIBCPP_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) \ + __attribute__((__format__(archetype, format_string_index, first_format_arg_index))) +# define _LIBCPP_PACKED __attribute__((__packed__)) + +// Use a function like macro to imply that it must be followed by a semicolon +# define _LIBCPP_FALLTHROUGH() [[fallthrough]] + # if __has_attribute(__no_sanitize__) && !defined(_LIBCPP_COMPILER_GCC) # define _LIBCPP_NO_CFI __attribute__((__no_sanitize__("cfi"))) # else # define _LIBCPP_NO_CFI # endif -# if __has_attribute(__malloc__) -# define _LIBCPP_NOALIAS __attribute__((__malloc__)) -# else -# define _LIBCPP_NOALIAS -# endif - # if __has_attribute(__using_if_exists__) # define _LIBCPP_USING_IF_EXISTS __attribute__((__using_if_exists__)) # else @@ -1149,15 +1154,6 @@ typedef __char32_t char32_t; # define _LIBCPP_DIAGNOSE_WARNING(...) # endif -// Use a function like macro to imply that it must be followed by a semicolon -# if __has_cpp_attribute(fallthrough) -# define _LIBCPP_FALLTHROUGH() [[fallthrough]] -# elif __has_attribute(__fallthrough__) -# define _LIBCPP_FALLTHROUGH() __attribute__((__fallthrough__)) -# else -# define _LIBCPP_FALLTHROUGH() ((void)0) -# endif - # if __has_cpp_attribute(_Clang::__lifetimebound__) # define _LIBCPP_LIFETIMEBOUND [[_Clang::__lifetimebound__]] # else @@ -1170,8 +1166,6 @@ typedef __char32_t char32_t; # define _LIBCPP_NOESCAPE # endif -# define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] - # if __has_cpp_attribute(_Clang::__no_specializations__) # define _LIBCPP_NO_SPECIALIZATIONS \ [[_Clang::__no_specializations__("Users are not allowed to specialize this standard library entity")]] @@ -1191,33 +1185,6 @@ typedef __char32_t char32_t; # define _LIBCPP_PREFERRED_NAME(x) # endif -# if __has_attribute(__no_sanitize__) -# define _LIBCPP_NO_SANITIZE(...) __attribute__((__no_sanitize__(__VA_ARGS__))) -# else -# define _LIBCPP_NO_SANITIZE(...) -# endif - -# if __has_attribute(__init_priority__) -# define _LIBCPP_INIT_PRIORITY_MAX __attribute__((__init_priority__(100))) -# else -# define _LIBCPP_INIT_PRIORITY_MAX -# endif - -# if __has_attribute(__format__) -// The attribute uses 1-based indices for ordinary and static member functions. 
-// The attribute uses 2-based indices for non-static member functions. -# define _LIBCPP_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) \ - __attribute__((__format__(archetype, format_string_index, first_format_arg_index))) -# else -# define _LIBCPP_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) /* nothing */ -# endif - -# if __has_attribute(__packed__) -# define _LIBCPP_PACKED __attribute__((__packed__)) -# else -# define _LIBCPP_PACKED -# endif - # if defined(_LIBCPP_ABI_MICROSOFT) && __has_declspec_attribute(empty_bases) # define _LIBCPP_DECLSPEC_EMPTY_BASES __declspec(empty_bases) # else diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index 0a751ba32954f..698ae209ae1f8 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -18,6 +18,7 @@ #include <__iterator/back_insert_iterator.h> #include <__iterator/iterator_traits.h> #include <__type_traits/decay.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_pointer.h> #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h index 35f0f745e201c..714c86ae2bb96 100644 --- a/libcxx/include/__vector/vector_bool.h +++ b/libcxx/include/__vector/vector_bool.h @@ -18,6 +18,7 @@ #include <__bit_reference> #include <__config> #include <__functional/unary_function.h> +#include <__fwd/bit_reference.h> #include <__fwd/functional.h> #include <__fwd/vector.h> #include <__iterator/distance.h> diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp index c41246522fdeb..2dbad321b782e 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp @@ -215,9 +215,7 @@ TEST_CONSTEXPR_CXX20 bool test() { Test, TriviallyComparable>().operator()*>(); #endif - // TODO: Remove the `_LIBCPP_ENABLE_EXPERIMENTAL` check once we have the FTM guarded or views::join isn't - // experimental anymore -#if TEST_STD_VER >= 20 && (!defined(_LIBCPP_VERSION) || defined(_LIBCPP_ENABLE_EXPERIMENTAL)) +#if TEST_STD_VER >= 20 { std::vector> vec = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; auto view = vec | std::views::join; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp index 4ae049c3ec001..760ee231bb9b6 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp @@ -131,9 +131,7 @@ constexpr bool test() { }); }); - // TODO: Remove the `_LIBCPP_ENABLE_EXPERIMENTAL` check once we have the FTM guarded or views::join isn't - // experimental anymore -#if TEST_STD_VER >= 20 && (!defined(_LIBCPP_VERSION) || defined(_LIBCPP_ENABLE_EXPERIMENTAL)) +#if TEST_STD_VER >= 20 { std::vector> vec = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; auto view = vec | std::views::join; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.pass.cpp index 94a1ec6a1d294..3db0bde75abd7 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each.pass.cpp @@ -56,9 +56,7 @@ struct Test { 
TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list(), Test()); - // TODO: Remove the `_LIBCPP_ENABLE_EXPERIMENTAL` check once we have the FTM guarded or views::join isn't - // experimental anymore -#if TEST_STD_VER >= 20 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) +#if TEST_STD_VER >= 20 { // Make sure that the segmented iterator optimization works during constant evaluation std::vector> vecs = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; auto v = std::views::join(vecs); diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index f1ede6474eb9e..0ee5bf3e71f59 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -548,59 +548,15 @@ apple-configuration) # TODO: It would be better to run the tests against the fake-installed version of libc++ instead xcrun --sdk macosx ninja -vC "${BUILD_DIR}/${arch}" check-cxx check-cxxabi check-cxx-abilist ;; -apple-system-hardened) - clean - - arch="$(uname -m)" - version="$(sw_vers --productVersion)" - params="target_triple=${arch}-apple-macosx${version}" - params+=";hardening_mode=fast" - - # In the Apple system configuration, we build libc++ and libunwind separately. - step "Installing libc++ and libc++abi in Apple-system configuration" - ${CMAKE} \ - -S "${MONOREPO_ROOT}/runtimes" \ - -B "${BUILD_DIR}/cxx" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/cxx" \ - -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ - -DLIBCXX_CXX_ABI=libcxxabi \ - -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Apple.cmake" \ - -DLIBCXX_TEST_CONFIG="apple-libc++-system.cfg.in" \ - -DLIBCXXABI_TEST_CONFIG="apple-libc++abi-system.cfg.in" \ - -DLIBCXX_TEST_PARAMS="${params}" \ - -DLIBCXXABI_TEST_PARAMS="${params}" - - step "Installing libunwind in Apple-system configuration" - ${CMAKE} \ - -S "${MONOREPO_ROOT}/runtimes" \ - -B "${BUILD_DIR}/unwind" \ - -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/unwind" \ - -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \ - -DLLVM_ENABLE_RUNTIMES="libunwind" \ - -DLIBUNWIND_TEST_CONFIG="apple-libunwind-system.cfg.in" \ - -DLIBUNWIND_TEST_PARAMS="${params}" \ - -DCMAKE_INSTALL_NAME_DIR="/usr/lib/system" - - step "Running the libc++ tests" - ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxx - - step "Running the libc++abi tests" - ${NINJA} -vC "${BUILD_DIR}/cxx" check-cxxabi - - step "Running the libunwind tests" - ${NINJA} -vC "${BUILD_DIR}/unwind" check-unwind -;; -apple-system) +apple-system|apple-system-hardened) clean arch="$(uname -m)" version="$(sw_vers --productVersion)" params="target_triple=${arch}-apple-macosx${version}" + if [[ "${BUILDER}" == *-hardened ]]; then + params+=";hardening_mode=fast" + fi # In the Apple system configuration, we build libc++ and libunwind separately. 
step "Installing libc++ and libc++abi in Apple-system configuration" diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index e9e531733abb5..d0f9d279cd0ea 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1957,7 +1957,7 @@ def produce_docs(): @dataclass class Metadata: - headers: list[str] = None + headers: List[str] = None test_suite_guard: str = None libcxx_guard: str = None diff --git a/libcxxabi/src/cxa_default_handlers.cpp b/libcxxabi/src/cxa_default_handlers.cpp index b029982ea87c3..97a6765fde8e9 100644 --- a/libcxxabi/src/cxa_default_handlers.cpp +++ b/libcxxabi/src/cxa_default_handlers.cpp @@ -9,6 +9,7 @@ // new_handler. //===----------------------------------------------------------------------===// +#include // std::abort #include #include #include "abort_message.h" @@ -94,7 +95,7 @@ static void demangling_unexpected_handler() static constexpr std::terminate_handler default_terminate_handler = demangling_terminate_handler; static constexpr std::terminate_handler default_unexpected_handler = demangling_unexpected_handler; #else // !LIBCXXABI_SILENT_TERMINATE -static constexpr std::terminate_handler default_terminate_handler = ::abort; +static constexpr std::terminate_handler default_terminate_handler = std::abort; static constexpr std::terminate_handler default_unexpected_handler = std::terminate; #endif // !LIBCXXABI_SILENT_TERMINATE diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index 9b97359d49cf9..bb912ab41d0fe 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -964,6 +964,7 @@ class LLDB_API SBTarget { friend class SBSection; friend class SBSourceManager; friend class SBSymbol; + friend class SBType; friend class SBTypeStaticField; friend class SBValue; friend class SBVariablesOptions; diff --git a/lldb/include/lldb/API/SBType.h b/lldb/include/lldb/API/SBType.h index 63ba91082d576..9ad3244686328 100644 --- a/lldb/include/lldb/API/SBType.h +++ b/lldb/include/lldb/API/SBType.h @@ -221,6 +221,13 @@ class SBType { lldb::SBType GetTemplateArgumentType(uint32_t idx); + /// Returns the value of the non-type template parameter at index \c idx. + /// If \c idx is out-of-bounds or the template parameter doesn't have + /// a value, returns an empty SBValue. + /// + /// This function will expand parameter packs. + lldb::SBValue GetTemplateArgumentValue(lldb::SBTarget target, uint32_t idx); + /// Return the TemplateArgumentKind of the template argument at index idx. /// Variadic argument packs are automatically expanded. 
lldb::TemplateArgumentKind GetTemplateArgumentKind(uint32_t idx); diff --git a/lldb/include/lldb/API/SBValue.h b/lldb/include/lldb/API/SBValue.h index 9090cece80f7c..46ef6daa95264 100644 --- a/lldb/include/lldb/API/SBValue.h +++ b/lldb/include/lldb/API/SBValue.h @@ -446,6 +446,7 @@ class LLDB_API SBValue { friend class SBModule; friend class SBTarget; friend class SBThread; + friend class SBType; friend class SBTypeStaticField; friend class SBTypeSummary; friend class SBValueList; diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index 70f4c4216221c..d7751ca045bb2 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -143,10 +143,6 @@ class Debugger : public std::enable_shared_from_this, File &GetErrorFile() { return m_error_stream_sp->GetFile(); } - StreamFile &GetOutputStream() { return *m_output_stream_sp; } - - StreamFile &GetErrorStream() { return *m_error_stream_sp; } - repro::DataRecorder *GetInputRecorder(); Status SetInputString(const char *data); diff --git a/lldb/include/lldb/Core/UserSettingsController.h b/lldb/include/lldb/Core/UserSettingsController.h index 32da7e05f7040..29e892fdba45b 100644 --- a/lldb/include/lldb/Core/UserSettingsController.h +++ b/lldb/include/lldb/Core/UserSettingsController.h @@ -38,9 +38,7 @@ class Properties { virtual ~Properties(); - virtual lldb::OptionValuePropertiesSP GetValueProperties() const { - // This function is virtual in case subclasses want to lazily implement - // creating the properties. + lldb::OptionValuePropertiesSP GetValueProperties() const { return m_collection_sp; } diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 4e6b22492a0d1..bf19d2ff8333c 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -185,7 +185,7 @@ lldb::SBError SBDebugger::InitializeWithErrorHandling() { llvm::sys::DynamicLibrary dynlib = llvm::sys::DynamicLibrary::getPermanentLibrary(spec.GetPath().c_str()); if (dynlib.isValid()) { - typedef bool (*LLDBCommandPluginInit)(lldb::SBDebugger & debugger); + typedef bool (*LLDBCommandPluginInit)(lldb::SBDebugger debugger); lldb::SBDebugger debugger_sb(debugger_sp); // This calls the bool lldb::PluginInitialize(lldb::SBDebugger debugger) @@ -508,39 +508,31 @@ SBFile SBDebugger::GetInputFile() { FILE *SBDebugger::GetOutputFileHandle() { LLDB_INSTRUMENT_VA(this); - if (m_opaque_sp) { - StreamFile &stream_file = m_opaque_sp->GetOutputStream(); - return stream_file.GetFile().GetStream(); - } + if (m_opaque_sp) + return m_opaque_sp->GetOutputStreamSP()->GetFile().GetStream(); return nullptr; } SBFile SBDebugger::GetOutputFile() { LLDB_INSTRUMENT_VA(this); - if (m_opaque_sp) { - SBFile file(m_opaque_sp->GetOutputStream().GetFileSP()); - return file; - } + if (m_opaque_sp) + return SBFile(m_opaque_sp->GetOutputStreamSP()->GetFileSP()); return SBFile(); } FILE *SBDebugger::GetErrorFileHandle() { LLDB_INSTRUMENT_VA(this); - if (m_opaque_sp) { - StreamFile &stream_file = m_opaque_sp->GetErrorStream(); - return stream_file.GetFile().GetStream(); - } + if (m_opaque_sp) + return m_opaque_sp->GetErrorStreamSP()->GetFile().GetStream(); return nullptr; } SBFile SBDebugger::GetErrorFile() { LLDB_INSTRUMENT_VA(this); SBFile file; - if (m_opaque_sp) { - SBFile file(m_opaque_sp->GetErrorStream().GetFileSP()); - return file; - } + if (m_opaque_sp) + return SBFile(m_opaque_sp->GetErrorStreamSP()->GetFileSP()); return SBFile(); } @@ -581,8 +573,8 @@ void SBDebugger::HandleCommand(const char *command) { 
sb_interpreter.HandleCommand(command, result, false); - result.PutError(m_opaque_sp->GetErrorStream().GetFileSP()); - result.PutOutput(m_opaque_sp->GetOutputStream().GetFileSP()); + result.PutError(m_opaque_sp->GetErrorStreamSP()->GetFileSP()); + result.PutOutput(m_opaque_sp->GetOutputStreamSP()->GetFileSP()); if (!m_opaque_sp->GetAsyncExecution()) { SBProcess process(GetCommandInterpreter().GetProcess()); diff --git a/lldb/source/API/SBType.cpp b/lldb/source/API/SBType.cpp index 4cc16c64e4756..6401d32c85795 100644 --- a/lldb/source/API/SBType.cpp +++ b/lldb/source/API/SBType.cpp @@ -687,6 +687,42 @@ lldb::TemplateArgumentKind SBType::GetTemplateArgumentKind(uint32_t idx) { return eTemplateArgumentKindNull; } +lldb::SBValue SBType::GetTemplateArgumentValue(lldb::SBTarget target, + uint32_t idx) { + LLDB_INSTRUMENT_VA(this, target, idx); + + if (!IsValid()) + return {}; + + std::optional arg; + const bool expand_pack = true; + switch (GetTemplateArgumentKind(idx)) { + case eTemplateArgumentKindIntegral: + arg = m_opaque_sp->GetCompilerType(false).GetIntegralTemplateArgument( + idx, expand_pack); + break; + default: + break; + } + + if (!arg) + return {}; + + Scalar value{arg->value}; + DataExtractor data; + value.GetData(data); + + ExecutionContext exe_ctx; + auto target_sp = target.GetSP(); + if (!target_sp) + return {}; + + target_sp->CalculateExecutionContext(exe_ctx); + + return ValueObject::CreateValueObjectFromData("value", data, exe_ctx, + arg->type); +} + SBType SBType::FindDirectNestedType(const char *name) { LLDB_INSTRUMENT_VA(this, name); diff --git a/lldb/source/Commands/CommandObjectDisassemble.cpp b/lldb/source/Commands/CommandObjectDisassemble.cpp index 5b131fe86dedb..70e687e19ac6d 100644 --- a/lldb/source/Commands/CommandObjectDisassemble.cpp +++ b/lldb/source/Commands/CommandObjectDisassemble.cpp @@ -21,6 +21,7 @@ #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/StackFrame.h" #include "lldb/Target/Target.h" +#include static constexpr unsigned default_disasm_byte_size = 32; static constexpr unsigned default_disasm_num_ins = 4; @@ -236,25 +237,31 @@ CommandObjectDisassemble::CommandObjectDisassemble( CommandObjectDisassemble::~CommandObjectDisassemble() = default; -llvm::Error CommandObjectDisassemble::CheckRangeSize(const AddressRange &range, - llvm::StringRef what) { +llvm::Expected> +CommandObjectDisassemble::CheckRangeSize(std::vector ranges, + llvm::StringRef what) { + addr_t total_range_size = 0; + for (const AddressRange &r : ranges) + total_range_size += r.GetByteSize(); + if (m_options.num_instructions > 0 || m_options.force || - range.GetByteSize() < GetDebugger().GetStopDisassemblyMaxSize()) - return llvm::Error::success(); + total_range_size < GetDebugger().GetStopDisassemblyMaxSize()) + return ranges; + StreamString msg; msg << "Not disassembling " << what << " because it is very large "; - range.Dump(&msg, &GetTarget(), Address::DumpStyleLoadAddress, - Address::DumpStyleFileAddress); + for (const AddressRange &r : ranges) + r.Dump(&msg, &GetTarget(), Address::DumpStyleLoadAddress, + Address::DumpStyleFileAddress); msg << ". 
To disassemble specify an instruction count limit, start/stop " "addresses or use the --force option."; - return llvm::createStringError(llvm::inconvertibleErrorCode(), - msg.GetString()); + return llvm::createStringError(msg.GetString()); } llvm::Expected> CommandObjectDisassemble::GetContainingAddressRanges() { std::vector ranges; - const auto &get_range = [&](Address addr) { + const auto &get_ranges = [&](Address addr) { ModuleSP module_sp(addr.GetModule()); SymbolContext sc; bool resolve_tail_call_address = true; @@ -262,9 +269,11 @@ CommandObjectDisassemble::GetContainingAddressRanges() { addr, eSymbolContextEverything, sc, resolve_tail_call_address); if (sc.function || sc.symbol) { AddressRange range; - sc.GetAddressRange(eSymbolContextFunction | eSymbolContextSymbol, 0, - false, range); - ranges.push_back(range); + for (uint32_t idx = 0; + sc.GetAddressRange(eSymbolContextFunction | eSymbolContextSymbol, + idx, false, range); + ++idx) + ranges.push_back(range); } }; @@ -273,14 +282,14 @@ CommandObjectDisassemble::GetContainingAddressRanges() { Address symbol_containing_address; if (target.ResolveLoadAddress(m_options.symbol_containing_addr, symbol_containing_address)) { - get_range(symbol_containing_address); + get_ranges(symbol_containing_address); } } else { for (lldb::ModuleSP module_sp : target.GetImages().Modules()) { Address file_address; if (module_sp->ResolveFileAddress(m_options.symbol_containing_addr, file_address)) { - get_range(file_address); + get_ranges(file_address); } } } @@ -292,9 +301,7 @@ CommandObjectDisassemble::GetContainingAddressRanges() { m_options.symbol_containing_addr); } - if (llvm::Error err = CheckRangeSize(ranges[0], "the function")) - return std::move(err); - return ranges; + return CheckRangeSize(std::move(ranges), "the function"); } llvm::Expected> @@ -304,29 +311,24 @@ CommandObjectDisassemble::GetCurrentFunctionRanges() { if (!frame) { if (process) { return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "Cannot disassemble around the current " - "function without the process being stopped.\n"); - } else { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "Cannot disassemble around the current " - "function without a selected frame: " - "no currently running process.\n"); + "Cannot disassemble around the current function without the process " + "being stopped.\n"); } + return llvm::createStringError( + "Cannot disassemble around the current function without a selected " + "frame: no currently running process.\n"); } - SymbolContext sc( - frame->GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol)); - AddressRange range; + SymbolContext sc = + frame->GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol); + std::vector ranges; if (sc.function) - range = sc.function->GetAddressRange(); - else if (sc.symbol && sc.symbol->ValueIsAddress()) { - range = {sc.symbol->GetAddress(), sc.symbol->GetByteSize()}; - } else - range = {frame->GetFrameCodeAddress(), default_disasm_byte_size}; - - if (llvm::Error err = CheckRangeSize(range, "the current function")) - return std::move(err); - return std::vector{range}; + ranges = sc.function->GetAddressRanges(); + else if (sc.symbol && sc.symbol->ValueIsAddress()) + ranges.emplace_back(sc.symbol->GetAddress(), sc.symbol->GetByteSize()); + else + ranges.emplace_back(frame->GetFrameCodeAddress(), default_disasm_byte_size); + + return CheckRangeSize(std::move(ranges), "the current function"); } llvm::Expected> @@ -372,19 +374,23 @@ 
CommandObjectDisassemble::GetNameRanges(CommandReturnObject &result) { std::vector ranges; llvm::Error range_errs = llvm::Error::success(); - AddressRange range; const uint32_t scope = eSymbolContextBlock | eSymbolContextFunction | eSymbolContextSymbol; const bool use_inline_block_range = true; for (SymbolContext sc : sc_list.SymbolContexts()) { + std::vector fn_ranges; + AddressRange range; for (uint32_t range_idx = 0; sc.GetAddressRange(scope, range_idx, use_inline_block_range, range); - ++range_idx) { - if (llvm::Error err = CheckRangeSize(range, "a range")) - range_errs = joinErrors(std::move(range_errs), std::move(err)); - else - ranges.push_back(range); - } + ++range_idx) + fn_ranges.push_back(std::move(range)); + + if (llvm::Expected> checked_ranges = + CheckRangeSize(std::move(fn_ranges), "a function")) + llvm::move(*checked_ranges, std::back_inserter(ranges)); + else + range_errs = + joinErrors(std::move(range_errs), checked_ranges.takeError()); } if (ranges.empty()) { if (range_errs) diff --git a/lldb/source/Commands/CommandObjectDisassemble.h b/lldb/source/Commands/CommandObjectDisassemble.h index f9cba1e5ae9cb..4fbcd72d1c042 100644 --- a/lldb/source/Commands/CommandObjectDisassemble.h +++ b/lldb/source/Commands/CommandObjectDisassemble.h @@ -100,7 +100,8 @@ class CommandObjectDisassemble : public CommandObjectParsed { llvm::Expected> GetPCRanges(); llvm::Expected> GetStartEndAddressRanges(); - llvm::Error CheckRangeSize(const AddressRange &range, llvm::StringRef what); + llvm::Expected> + CheckRangeSize(std::vector ranges, llvm::StringRef what); CommandOptions m_options; }; diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index cf5f6ac9da489..82fb5f42f9f4b 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -18,7 +18,6 @@ endif() if (LLVM_BUILD_TELEMETRY) set(TELEMETRY_DEPS Telemetry) - add_definitions(-DLLDB_BUILD_TELEMETRY) endif() # TODO: Add property `NO_PLUGIN_DEPENDENCIES` to lldbCore diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 2df2aeb20aa26..18569e155b517 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -257,12 +257,11 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx, std::list errors; StreamString feedback_stream; if (!target_sp->LoadScriptingResources(errors, feedback_stream)) { - Stream &s = GetErrorStream(); - for (auto &error : errors) { - s.Printf("%s\n", error.AsCString()); - } + lldb::StreamSP s = GetAsyncErrorStream(); + for (auto &error : errors) + s->Printf("%s\n", error.AsCString()); if (feedback_stream.GetSize()) - s.PutCString(feedback_stream.GetString()); + s->PutCString(feedback_stream.GetString()); } } } diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index acc84dbf016fb..9c6ca1e5f910c 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -263,7 +263,7 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( module_sp = std::make_shared(module_spec); } else if (force_symbol_search && error.AsCString("") && error.AsCString("")[0] != '\0') { - target.GetDebugger().GetErrorStream() << error.AsCString(); + *target.GetDebugger().GetAsyncErrorStream() << error.AsCString(); } } @@ -328,19 +328,19 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( } } else { if (force_symbol_search) { - Stream &s = target.GetDebugger().GetErrorStream(); - s.Printf("Unable to find file"); + lldb::StreamSP s = 
target.GetDebugger().GetAsyncErrorStream(); + s->Printf("Unable to find file"); if (!name.empty()) - s.Printf(" %s", name.str().c_str()); + s->Printf(" %s", name.str().c_str()); if (uuid.IsValid()) - s.Printf(" with UUID %s", uuid.GetAsString().c_str()); + s->Printf(" with UUID %s", uuid.GetAsString().c_str()); if (value != LLDB_INVALID_ADDRESS) { if (value_is_offset) - s.Printf(" with slide 0x%" PRIx64, value); + s->Printf(" with slide 0x%" PRIx64, value); else - s.Printf(" at address 0x%" PRIx64, value); + s->Printf(" at address 0x%" PRIx64, value); } - s.Printf("\n"); + s->Printf("\n"); } LLDB_LOGF(log, "Unable to find binary %s with UUID %s and load it at " diff --git a/lldb/source/Core/Telemetry.cpp b/lldb/source/Core/Telemetry.cpp index 99f5d43ccbaf0..0d0d7c1df3bb9 100644 --- a/lldb/source/Core/Telemetry.cpp +++ b/lldb/source/Core/Telemetry.cpp @@ -6,7 +6,9 @@ // //===----------------------------------------------------------------------===// -#ifdef LLDB_BUILD_TELEMETRY +#include "llvm/Config/llvm-config.h" + +#ifdef LLVM_BUILD_TELEMETRY #include "lldb/Core/Telemetry.h" #include "lldb/Core/Debugger.h" @@ -71,4 +73,4 @@ llvm::Error TelemetryManager::preDispatch(TelemetryInfo *entry) { } // namespace telemetry } // namespace lldb_private -#endif // LLDB_BUILD_TELEMETRY +#endif // LLVM_BUILD_TELEMETRY diff --git a/lldb/source/Core/UserSettingsController.cpp b/lldb/source/Core/UserSettingsController.cpp index b57c1b0eef9b4..5408d64b40647 100644 --- a/lldb/source/Core/UserSettingsController.cpp +++ b/lldb/source/Core/UserSettingsController.cpp @@ -40,64 +40,45 @@ Properties::~Properties() = default; lldb::OptionValueSP Properties::GetPropertyValue(const ExecutionContext *exe_ctx, llvm::StringRef path, Status &error) const { - OptionValuePropertiesSP properties_sp(GetValueProperties()); - if (properties_sp) - return properties_sp->GetSubValue(exe_ctx, path, error); - return lldb::OptionValueSP(); + return m_collection_sp->GetSubValue(exe_ctx, path, error); } Status Properties::SetPropertyValue(const ExecutionContext *exe_ctx, VarSetOperationType op, llvm::StringRef path, llvm::StringRef value) { - OptionValuePropertiesSP properties_sp(GetValueProperties()); - if (properties_sp) - return properties_sp->SetSubValue(exe_ctx, op, path, value); - return Status::FromErrorString("no properties"); + return m_collection_sp->SetSubValue(exe_ctx, op, path, value); } void Properties::DumpAllPropertyValues(const ExecutionContext *exe_ctx, Stream &strm, uint32_t dump_mask, bool is_json) { - OptionValuePropertiesSP properties_sp(GetValueProperties()); - if (!properties_sp) - return; - if (is_json) { - llvm::json::Value json = properties_sp->ToJSON(exe_ctx); + llvm::json::Value json = m_collection_sp->ToJSON(exe_ctx); strm.Printf("%s", llvm::formatv("{0:2}", json).str().c_str()); } else - properties_sp->DumpValue(exe_ctx, strm, dump_mask); + m_collection_sp->DumpValue(exe_ctx, strm, dump_mask); } void Properties::DumpAllDescriptions(CommandInterpreter &interpreter, Stream &strm) const { strm.PutCString("Top level variables:\n\n"); - OptionValuePropertiesSP properties_sp(GetValueProperties()); - if (properties_sp) - return properties_sp->DumpAllDescriptions(interpreter, strm); + return m_collection_sp->DumpAllDescriptions(interpreter, strm); } Status Properties::DumpPropertyValue(const ExecutionContext *exe_ctx, Stream &strm, llvm::StringRef property_path, uint32_t dump_mask, bool is_json) { - OptionValuePropertiesSP properties_sp(GetValueProperties()); - if (properties_sp) { - return 
properties_sp->DumpPropertyValue(exe_ctx, strm, property_path, + return m_collection_sp->DumpPropertyValue(exe_ctx, strm, property_path, dump_mask, is_json); - } - return Status::FromErrorString("empty property list"); } size_t Properties::Apropos(llvm::StringRef keyword, std::vector &matching_properties) const { - OptionValuePropertiesSP properties_sp(GetValueProperties()); - if (properties_sp) { - properties_sp->Apropos(keyword, matching_properties); - } + m_collection_sp->Apropos(keyword, matching_properties); return matching_properties.size(); } diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 559b8301c1010..8d10e5de01225 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -245,8 +245,8 @@ ScriptInterpreterIORedirect::ScriptInterpreterIORedirect( if (outfile_handle) ::setbuf(outfile_handle, nullptr); - result->SetImmediateOutputFile(debugger.GetOutputStream().GetFileSP()); - result->SetImmediateErrorFile(debugger.GetErrorStream().GetFileSP()); + result->SetImmediateOutputFile(debugger.GetOutputStreamSP()->GetFileSP()); + result->SetImmediateErrorFile(debugger.GetErrorStreamSP()->GetFileSP()); } } diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 76f2db086476f..a77155f6bf41e 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -31,6 +31,7 @@ #include "lldb/Core/Address.h" #include "lldb/Core/Module.h" +#include "lldb/Symbol/Function.h" #include "lldb/Symbol/SymbolContext.h" #include "lldb/Target/ExecutionContext.h" #include "lldb/Target/Process.h" @@ -1806,10 +1807,13 @@ const char *DisassemblerLLVMC::SymbolLookup(uint64_t value, uint64_t *type_ptr, bool format_omitting_current_func_name = false; if (sym_ctx.symbol || sym_ctx.function) { AddressRange range; - if (sym_ctx.GetAddressRange(resolve_scope, 0, false, range) && - range.GetBaseAddress().IsValid() && - range.ContainsLoadAddress(value_so_addr, target)) { - format_omitting_current_func_name = true; + for (uint32_t idx = 0; + sym_ctx.GetAddressRange(resolve_scope, idx, false, range); + ++idx) { + if (range.ContainsLoadAddress(value_so_addr, target)) { + format_omitting_current_func_name = true; + break; + } } } diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index b8941dae01078..cff44b588e26e 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -738,9 +738,9 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( } if (IsKernel() && m_uuid.IsValid()) { - Stream &s = target.GetDebugger().GetOutputStream(); - s.Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str()); - s.Printf("Load Address: 0x%" PRIx64 "\n", m_load_address); + lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream(); + s->Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str()); + s->Printf("Load Address: 0x%" PRIx64 "\n", m_load_address); // Start of a kernel debug session, we have the UUID of the kernel. 
// Go through the target's list of modules and if there are any kernel @@ -830,12 +830,12 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( } if (IsKernel() && !m_module_sp) { - Stream &s = target.GetDebugger().GetErrorStream(); - s.Printf("WARNING: Unable to locate kernel binary on the debugger " - "system.\n"); + lldb::StreamSP s = target.GetDebugger().GetAsyncErrorStream(); + s->Printf("WARNING: Unable to locate kernel binary on the debugger " + "system.\n"); if (kernel_search_error.Fail() && kernel_search_error.AsCString("") && kernel_search_error.AsCString("")[0] != '\0') { - s << kernel_search_error.AsCString(); + *s << kernel_search_error.AsCString(); } } } @@ -974,22 +974,19 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule( bool is_loaded = IsLoaded(); if (is_loaded && m_module_sp && IsKernel()) { - Stream &s = target.GetDebugger().GetOutputStream(); + lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream(); ObjectFile *kernel_object_file = m_module_sp->GetObjectFile(); if (kernel_object_file) { addr_t file_address = kernel_object_file->GetBaseAddress().GetFileAddress(); if (m_load_address != LLDB_INVALID_ADDRESS && file_address != LLDB_INVALID_ADDRESS) { - s.Printf("Kernel slid 0x%" PRIx64 " in memory.\n", - m_load_address - file_address); + s->Printf("Kernel slid 0x%" PRIx64 " in memory.\n", + m_load_address - file_address); } } - { - s.Printf("Loaded kernel file %s\n", - m_module_sp->GetFileSpec().GetPath().c_str()); - } - s.Flush(); + s->Printf("Loaded kernel file %s\n", + m_module_sp->GetFileSpec().GetPath().c_str()); } // Notify the target about the module being added; @@ -1195,10 +1192,11 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { lldb::offset_t offset = 0; m_kext_summary_header.version = data.GetU32(&offset); if (m_kext_summary_header.version > 128) { - Stream &s = m_process->GetTarget().GetDebugger().GetOutputStream(); - s.Printf("WARNING: Unable to read kext summary header, got " - "improbable version number %u\n", - m_kext_summary_header.version); + lldb::StreamSP s = + m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + s->Printf("WARNING: Unable to read kext summary header, got " + "improbable version number %u\n", + m_kext_summary_header.version); // If we get an improbably large version number, we're probably // getting bad memory. m_kext_summary_header_addr.Clear(); @@ -1209,11 +1207,11 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { if (m_kext_summary_header.entry_size > 4096) { // If we get an improbably large entry_size, we're probably // getting bad memory. - Stream &s = - m_process->GetTarget().GetDebugger().GetOutputStream(); - s.Printf("WARNING: Unable to read kext summary header, got " - "improbable entry_size %u\n", - m_kext_summary_header.entry_size); + lldb::StreamSP s = + m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + s->Printf("WARNING: Unable to read kext summary header, got " + "improbable entry_size %u\n", + m_kext_summary_header.entry_size); m_kext_summary_header_addr.Clear(); return false; } @@ -1227,10 +1225,11 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { if (m_kext_summary_header.entry_count > 10000) { // If we get an improbably large number of kexts, we're probably // getting bad memory. 
- Stream &s = m_process->GetTarget().GetDebugger().GetOutputStream(); - s.Printf("WARNING: Unable to read kext summary header, got " - "improbable number of kexts %u\n", - m_kext_summary_header.entry_count); + lldb::StreamSP s = + m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + s->Printf("WARNING: Unable to read kext summary header, got " + "improbable number of kexts %u\n", + m_kext_summary_header.entry_count); m_kext_summary_header_addr.Clear(); return false; } @@ -1331,17 +1330,18 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( number_of_old_kexts_being_removed == 0) return true; - Stream &s = m_process->GetTarget().GetDebugger().GetOutputStream(); + lldb::StreamSP s = m_process->GetTarget().GetDebugger().GetOutputStreamSP(); if (load_kexts) { if (number_of_new_kexts_being_added > 0 && number_of_old_kexts_being_removed > 0) { - s.Printf("Loading %d kext modules and unloading %d kext modules ", - number_of_new_kexts_being_added, - number_of_old_kexts_being_removed); + s->Printf("Loading %d kext modules and unloading %d kext modules ", + number_of_new_kexts_being_added, + number_of_old_kexts_being_removed); } else if (number_of_new_kexts_being_added > 0) { - s.Printf("Loading %d kext modules ", number_of_new_kexts_being_added); + s->Printf("Loading %d kext modules ", number_of_new_kexts_being_added); } else if (number_of_old_kexts_being_removed > 0) { - s.Printf("Unloading %d kext modules ", number_of_old_kexts_being_removed); + s->Printf("Unloading %d kext modules ", + number_of_old_kexts_being_removed); } } @@ -1405,7 +1405,7 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( if (image_info.GetModule()) { unloaded_module_list.AppendIfNeeded(image_info.GetModule()); } - s.Printf("."); + s->Printf("."); image_info.Clear(); // should pull it out of the KextImageInfos vector but that would // mutate the list and invalidate the to_be_removed bool vector; @@ -1417,11 +1417,11 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( } if (load_kexts) { - s.Printf(" done.\n"); + s->Printf(" done.\n"); if (kexts_failed_to_load.size() > 0 && number_of_new_kexts_being_added > 0) { - s.Printf("Failed to load %d of %d kexts:\n", - (int)kexts_failed_to_load.size(), - number_of_new_kexts_being_added); + s->Printf("Failed to load %d of %d kexts:\n", + (int)kexts_failed_to_load.size(), + number_of_new_kexts_being_added); // print a sorted list of kexts which failed to load unsigned longest_name = 0; std::sort(kexts_failed_to_load.begin(), kexts_failed_to_load.end()); @@ -1433,10 +1433,9 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( std::string uuid; if (ku.second.IsValid()) uuid = ku.second.GetAsString(); - s.Printf(" %-*s %s\n", longest_name, ku.first.c_str(), uuid.c_str()); + s->Printf(" %-*s %s\n", longest_name, ku.first.c_str(), uuid.c_str()); } } - s.Flush(); } return true; diff --git a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp index 8391467c375f4..3bf0a46de57af 100644 --- a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp @@ -327,9 +327,9 @@ bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule( Target &target = process->GetTarget(); if (IsKernel() && m_uuid.IsValid()) { - Stream &s = target.GetDebugger().GetOutputStream(); - s.Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str()); - s.Printf("Load Address: 
0x%" PRIx64 "\n", m_load_address); + lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream(); + s->Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str()); + s->Printf("Load Address: 0x%" PRIx64 "\n", m_load_address); } // Test if the module is loaded into the taget, @@ -355,9 +355,9 @@ bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule( if (!m_module_sp) m_module_sp = target.GetOrCreateModule(module_spec, true); if (IsKernel() && !m_module_sp) { - Stream &s = target.GetDebugger().GetOutputStream(); - s.Printf("WARNING: Unable to locate kernel binary on the debugger " - "system.\n"); + lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream(); + s->Printf("WARNING: Unable to locate kernel binary on the debugger " + "system.\n"); } } @@ -464,20 +464,19 @@ bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule( } if (IsLoaded() && m_module_sp && IsKernel()) { - Stream &s = target.GetDebugger().GetOutputStream(); + lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream(); ObjectFile *kernel_object_file = m_module_sp->GetObjectFile(); if (kernel_object_file) { addr_t file_address = kernel_object_file->GetBaseAddress().GetFileAddress(); if (m_load_address != LLDB_INVALID_ADDRESS && file_address != LLDB_INVALID_ADDRESS) { - s.Printf("Kernel slide 0x%" PRIx64 " in memory.\n", - m_load_address - file_address); - s.Printf("Loaded kernel file %s\n", - m_module_sp->GetFileSpec().GetPath().c_str()); + s->Printf("Kernel slide 0x%" PRIx64 " in memory.\n", + m_load_address - file_address); + s->Printf("Loaded kernel file %s\n", + m_module_sp->GetFileSpec().GetPath().c_str()); } } - s.Flush(); } return IsLoaded(); diff --git a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp index 70e36801c3fd7..498da3ffe5a4a 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp @@ -864,13 +864,14 @@ bool InstrumentationRuntimeTSan::NotifyBreakpointHit( CreateStopReasonWithInstrumentationData( *thread_sp, stop_reason_description, report)); - StreamFile &s = process_sp->GetTarget().GetDebugger().GetOutputStream(); - s.Printf("ThreadSanitizer report breakpoint hit. Use 'thread " - "info -s' to get extended information about the " - "report.\n"); + lldb::StreamSP s = + process_sp->GetTarget().GetDebugger().GetAsyncOutputStream(); + s->Printf("ThreadSanitizer report breakpoint hit. Use 'thread " + "info -s' to get extended information about the " + "report.\n"); return true; // Return true to stop the target - } else + } return false; // Let target run } diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTrampolineHandler.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTrampolineHandler.cpp index 2b8adeae10d14..7774eb843c62d 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTrampolineHandler.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTrampolineHandler.cpp @@ -622,14 +622,14 @@ AppleObjCTrampolineHandler::AppleObjCTrampolineHandler( // step through any method dispatches. Warn to that effect and get out of // here. 
if (process_sp->CanJIT()) { - process_sp->GetTarget().GetDebugger().GetErrorStream().Printf( + process_sp->GetTarget().GetDebugger().GetAsyncErrorStream()->Printf( "Could not find implementation lookup function \"%s\"" " step in through ObjC method dispatch will not work.\n", get_impl_name.AsCString()); } return; } - + // We will either set the implementation to the _stret or non_stret version, // so either way it's safe to start filling the m_lookup_..._code here. m_lookup_implementation_function_code.assign( diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 896fc6951b85c..7e8eee9f5aa4f 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -289,7 +289,7 @@ bool ScriptInterpreterLua::BreakpointCallbackFunction( llvm::Expected BoolOrErr = lua.CallBreakpointCallback( baton, stop_frame_sp, bp_loc_sp, bp_option_data->m_extra_args_sp); if (llvm::Error E = BoolOrErr.takeError()) { - debugger.GetErrorStream() << toString(std::move(E)); + *debugger.GetAsyncErrorStream() << toString(std::move(E)); return true; } @@ -316,7 +316,7 @@ bool ScriptInterpreterLua::WatchpointCallbackFunction( llvm::Expected BoolOrErr = lua.CallWatchpointCallback(baton, stop_frame_sp, wp_sp); if (llvm::Error E = BoolOrErr.takeError()) { - debugger.GetErrorStream() << toString(std::move(E)); + *debugger.GetAsyncErrorStream() << toString(std::move(E)); return true; } diff --git a/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp b/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp index 7aeee6e403954..d0c3df05e6320 100644 --- a/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/None/ScriptInterpreterNone.cpp @@ -33,12 +33,12 @@ static const char *no_interpreter_err_msg = bool ScriptInterpreterNone::ExecuteOneLine(llvm::StringRef command, CommandReturnObject *, const ExecuteScriptOptions &) { - m_debugger.GetErrorStream().PutCString(no_interpreter_err_msg); + m_debugger.GetAsyncErrorStream()->PutCString(no_interpreter_err_msg); return false; } void ScriptInterpreterNone::ExecuteInterpreterLoop() { - m_debugger.GetErrorStream().PutCString(no_interpreter_err_msg); + m_debugger.GetAsyncErrorStream()->PutCString(no_interpreter_err_msg); } void ScriptInterpreterNone::Initialize() { diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index f4efb00161f8b..9ea5b95a3d803 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -1910,10 +1910,10 @@ bool ScriptInterpreterPythonImpl::BreakpointCallbackFunction( llvm::handleAllErrors( maybe_ret_val.takeError(), [&](PythonException &E) { - debugger.GetErrorStream() << E.ReadBacktrace(); + *debugger.GetAsyncErrorStream() << E.ReadBacktrace(); }, [&](const llvm::ErrorInfoBase &E) { - debugger.GetErrorStream() << E.message(); + *debugger.GetAsyncErrorStream() << E.message(); }); } else { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index 1e2564cb22f25..0db230d0a8b56 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -622,12 +622,12 @@ 
std::optional DWARFDIE::getLanguage() const { } DWARFDIE DWARFDIE::resolveReferencedType(dw_attr_t attr) const { - return GetReferencedDIE(attr); + return GetReferencedDIE(attr).resolveTypeUnitReference(); } DWARFDIE DWARFDIE::resolveReferencedType(DWARFFormValue v) const { if (IsValid()) - return v.Reference(); + return v.Reference().resolveTypeUnitReference(); return {}; } diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp index 139fa06d08fca..6ecc988d7a5a9 100644 --- a/lldb/source/Symbol/Block.cpp +++ b/lldb/source/Symbol/Block.cpp @@ -243,25 +243,17 @@ bool Block::GetRangeContainingAddress(const Address &addr, AddressRange &range) { Function *function = CalculateSymbolContextFunction(); if (function) { - const AddressRange &func_range = function->GetAddressRange(); - if (addr.GetModule() == func_range.GetBaseAddress().GetModule()) { - const addr_t file_addr = addr.GetFileAddress(); - const addr_t func_file_addr = - func_range.GetBaseAddress().GetFileAddress(); - if (file_addr >= func_file_addr && - file_addr < func_file_addr + func_range.GetByteSize()) { - addr_t offset = file_addr - func_file_addr; - - const Range *range_ptr = m_ranges.FindEntryThatContains(offset); - - if (range_ptr) { - range.GetBaseAddress() = - Address(func_file_addr + range_ptr->GetRangeBase(), - addr.GetModule()->GetSectionList()); - range.SetByteSize(range_ptr->GetByteSize()); - return true; - } - } + if (uint32_t idx = GetRangeIndexContainingAddress(addr); + idx != UINT32_MAX) { + const Range *range_ptr = m_ranges.GetEntryAtIndex(idx); + assert(range_ptr); + + Address func_addr = function->GetAddress(); + range.GetBaseAddress() = + Address(func_addr.GetFileAddress() + range_ptr->GetRangeBase(), + func_addr.GetModule()->GetSectionList()); + range.SetByteSize(range_ptr->GetByteSize()); + return true; } } range.Clear(); @@ -278,19 +270,16 @@ bool Block::GetRangeContainingLoadAddress(lldb::addr_t load_addr, uint32_t Block::GetRangeIndexContainingAddress(const Address &addr) { Function *function = CalculateSymbolContextFunction(); - if (function) { - const AddressRange &func_range = function->GetAddressRange(); - if (addr.GetSection() == func_range.GetBaseAddress().GetSection()) { - const addr_t addr_offset = addr.GetOffset(); - const addr_t func_offset = func_range.GetBaseAddress().GetOffset(); - if (addr_offset >= func_offset && - addr_offset < func_offset + func_range.GetByteSize()) { - addr_t offset = addr_offset - func_offset; - return m_ranges.FindEntryIndexThatContains(offset); - } - } - } - return UINT32_MAX; + if (!function) + return UINT32_MAX; + + const Address &func_addr = function->GetAddress(); + if (addr.GetModule() != func_addr.GetModule()) + return UINT32_MAX; + + const addr_t file_addr = addr.GetFileAddress(); + const addr_t func_file_addr = func_addr.GetFileAddress(); + return m_ranges.FindEntryIndexThatContains(file_addr - func_file_addr); } bool Block::GetRangeAtIndex(uint32_t range_idx, AddressRange &range) { diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index 19f4f91e29d25..183947a694363 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -104,15 +104,19 @@ bool SymbolContext::DumpStopContext( if (addr_t file_addr = addr.GetFileAddress(); file_addr != LLDB_INVALID_ADDRESS) { - const addr_t function_offset = - file_addr - function->GetAddress().GetFileAddress(); + // Avoiding signed arithmetic due to UB in -INT_MAX. 
+ const char sign = + file_addr >= function->GetAddress().GetFileAddress() ? '+' : '-'; + addr_t offset = file_addr - function->GetAddress().GetFileAddress(); + if (sign == '-') + offset = -offset; if (!show_function_name) { // Print +offset even if offset is 0 dumped_something = true; - s->Printf("+%" PRIu64 ">", function_offset); - } else if (function_offset) { + s->Format("{0}{1}>", sign, offset); + } else if (offset) { dumped_something = true; - s->Printf(" + %" PRIu64, function_offset); + s->Format(" {0} {1}", sign, offset); } } @@ -351,8 +355,8 @@ bool SymbolContext::GetAddressRange(uint32_t scope, uint32_t range_idx, } if ((scope & eSymbolContextFunction) && (function != nullptr)) { - if (range_idx == 0) { - range = function->GetAddressRange(); + if (range_idx < function->GetAddressRanges().size()) { + range = function->GetAddressRanges()[range_idx]; return true; } } diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 428f8519b72fd..0041c8f2b2db2 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -1664,7 +1664,7 @@ Process::CreateBreakpointSite(const BreakpointLocationSP &constituent, Address symbol_address = symbol->GetAddress(); load_addr = ResolveIndirectFunction(&symbol_address, error); if (!error.Success() && show_error) { - GetTarget().GetDebugger().GetErrorStream().Printf( + GetTarget().GetDebugger().GetAsyncErrorStream()->Printf( "warning: failed to resolve indirect function at 0x%" PRIx64 " for breakpoint %i.%i: %s\n", symbol->GetLoadAddress(&GetTarget()), @@ -1703,7 +1703,7 @@ Process::CreateBreakpointSite(const BreakpointLocationSP &constituent, } else { if (show_error || use_hardware) { // Report error for setting breakpoint... - GetTarget().GetDebugger().GetErrorStream().Printf( + GetTarget().GetDebugger().GetAsyncErrorStream()->Printf( "warning: failed to set breakpoint site at 0x%" PRIx64 " for breakpoint %i.%i: %s\n", load_addr, constituent->GetBreakpoint().GetID(), diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 8d77097477651..db289fe9c4b64 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1532,15 +1532,15 @@ static void LoadScriptingResourceForModule(const ModuleSP &module_sp, if (module_sp && !module_sp->LoadScriptingResourceInTarget(target, error, feedback_stream)) { if (error.AsCString()) - target->GetDebugger().GetErrorStream().Printf( + target->GetDebugger().GetAsyncErrorStream()->Printf( "unable to load scripting data for module %s - error reported was " "%s\n", module_sp->GetFileSpec().GetFileNameStrippingExtension().GetCString(), error.AsCString()); } if (feedback_stream.GetSize()) - target->GetDebugger().GetErrorStream().Printf("%s\n", - feedback_stream.GetData()); + target->GetDebugger().GetAsyncErrorStream()->Printf( + "%s\n", feedback_stream.GetData()); } void Target::ClearModules(bool delete_locations) { diff --git a/lldb/source/Target/ThreadPlanStepOut.cpp b/lldb/source/Target/ThreadPlanStepOut.cpp index c0ea53e4a8cbb..a05c46db6b8ca 100644 --- a/lldb/source/Target/ThreadPlanStepOut.cpp +++ b/lldb/source/Target/ThreadPlanStepOut.cpp @@ -364,8 +364,11 @@ bool ThreadPlanStepOut::ShouldStop(Event *event_ptr) { } if (!done) { - StackID frame_zero_id = GetThread().GetStackFrameAtIndex(0)->GetStackID(); - done = !(frame_zero_id < m_step_out_to_id); + StopInfoSP stop_info_sp = GetPrivateStopInfo(); + if (stop_info_sp && stop_info_sp->GetStopReason() == eStopReasonBreakpoint) { + StackID frame_zero_id = 
GetThread().GetStackFrameAtIndex(0)->GetStackID(); + done = !(frame_zero_id < m_step_out_to_id); + } } // The normal step out computations think we are done, so all we need to do diff --git a/lldb/source/Target/ThreadPlanTracer.cpp b/lldb/source/Target/ThreadPlanTracer.cpp index 356ce379c2993..a119bf8589279 100644 --- a/lldb/source/Target/ThreadPlanTracer.cpp +++ b/lldb/source/Target/ThreadPlanTracer.cpp @@ -47,7 +47,7 @@ Stream *ThreadPlanTracer::GetLogStream() { else { TargetSP target_sp(GetThread().CalculateTarget()); if (target_sp) - return &(target_sp->GetDebugger().GetOutputStream()); + return target_sp->GetDebugger().GetOutputStreamSP().get(); } return nullptr; } diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/Makefile b/lldb/test/API/functionalities/thread/finish-from-empty-func/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py b/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py new file mode 100644 index 0000000000000..bf57070e336e7 --- /dev/null +++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/TestEmptyFuncThreadStepOut.py @@ -0,0 +1,43 @@ +""" +Test finish out of an empty function (may be one-instruction long) +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class FinishFromEmptyFunctionTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_finish_from_empty_function(self): + """Test that when stopped at a breakpoint in an empty function, finish leaves it correctly.""" + self.build() + exe = self.getBuildArtifact("a.out") + target, process, thread, _ = lldbutil.run_to_name_breakpoint( + self, "done", exe_name=exe + ) + if self.TraceOn(): + self.runCmd("bt") + + correct_stepped_out_line = line_number("main.c", "leaving main") + return_statement_line = line_number("main.c", "return 0") + safety_bp = target.BreakpointCreateByLocation( + lldb.SBFileSpec("main.c"), return_statement_line + ) + self.assertTrue(safety_bp.IsValid()) + + error = lldb.SBError() + thread.StepOut(error) + self.assertTrue(error.Success()) + + if self.TraceOn(): + self.runCmd("bt") + + frame = thread.GetSelectedFrame() + self.assertEqual( + frame.line_entry.GetLine(), + correct_stepped_out_line, + "Step-out lost control of execution, ran too far", + ) diff --git a/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c b/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c new file mode 100644 index 0000000000000..bc66a548a89df --- /dev/null +++ b/lldb/test/API/functionalities/thread/finish-from-empty-func/main.c @@ -0,0 +1,8 @@ +#include +void done() {} +int main() { + puts("in main"); + done(); + puts("leaving main"); + return 0; +} diff --git a/lldb/test/API/lang/cpp/class-template-parameter-pack/TestTemplatePackArgs.py b/lldb/test/API/lang/cpp/class-template-parameter-pack/TestTemplatePackArgs.py index c571357ff6720..f2467cbea9439 100644 --- a/lldb/test/API/lang/cpp/class-template-parameter-pack/TestTemplatePackArgs.py +++ b/lldb/test/API/lang/cpp/class-template-parameter-pack/TestTemplatePackArgs.py @@ -11,7 +11,7 @@ class TemplatePackArgsTestCase(TestBase): def test_template_argument_pack(self): self.build() - (_, _, thread, _) = 
lldbutil.run_to_source_breakpoint( + (target, _, thread, _) = lldbutil.run_to_source_breakpoint( self, "breakpoint here", lldb.SBFileSpec("main.cpp"), exe_name="a.out" ) frame = thread.GetSelectedFrame() @@ -33,10 +33,25 @@ def test_template_argument_pack(self): self.assertEqual( only_pack.GetType().GetTemplateArgumentType(2).GetName(), "double" ) - # Access the C template parameter. + nested_template = only_pack.GetType().GetTemplateArgumentType(3) self.assertEqual(nested_template.GetName(), "D") self.assertEqual(nested_template.GetNumberOfTemplateArguments(), 3) self.assertEqual(nested_template.GetTemplateArgumentType(0).GetName(), "int") self.assertEqual(nested_template.GetTemplateArgumentType(1).GetName(), "int") self.assertEqual(nested_template.GetTemplateArgumentType(2).GetName(), "bool") + + my_c = frame.FindVariable("myC") + self.assertTrue(my_c.IsValid(), "make sure we find the myC variable") + + # Out of bounds index. + self.assertFalse(my_c.GetType().GetTemplateArgumentValue(target, 3)) + + # Out of bounds index. + template_param_value = my_c.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "int") + self.assertEqual(template_param_value.GetValueAsSigned(), 16) + + template_param_value = my_c.GetType().GetTemplateArgumentValue(target, 2) + self.assertEqual(template_param_value.GetTypeName(), "int") + self.assertEqual(template_param_value.GetValueAsSigned(), 32) diff --git a/lldb/test/API/lang/cpp/template-arguments/Makefile b/lldb/test/API/lang/cpp/template-arguments/Makefile index 99998b20bcb05..4f79c0a900c3a 100644 --- a/lldb/test/API/lang/cpp/template-arguments/Makefile +++ b/lldb/test/API/lang/cpp/template-arguments/Makefile @@ -1,3 +1,4 @@ CXX_SOURCES := main.cpp +CXXFLAGS_EXTRAS := -std=c++20 include Makefile.rules diff --git a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py index 7b63a6cca8db4..f1b3d7a9806fd 100644 --- a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py +++ b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py @@ -8,7 +8,7 @@ class TestCase(TestBase): @no_debug_info_test def test(self): self.build() - self.dbg.CreateTarget(self.getBuildArtifact("a.out")) + target = self.dbg.CreateTarget(self.getBuildArtifact("a.out")) value = self.expect_expr("temp1", result_type="C") template_type = value.GetType() @@ -27,10 +27,42 @@ def test(self): self.assertEqual( template_type.GetTemplateArgumentType(1).GetName(), "unsigned int" ) - # FIXME: There is no way to get the actual value of the parameter. + + # Template parameter isn't a NTTP. + self.assertFalse(template_type.GetTemplateArgumentValue(target, 0)) + + # Template parameter index out-of-bounds. + self.assertFalse(template_type.GetTemplateArgumentValue(target, 2)) + + # Template parameter is a NTTP. + param_val = template_type.GetTemplateArgumentValue(target, 1) + self.assertEqual(param_val.GetTypeName(), "unsigned int") + self.assertEqual(param_val.GetValueAsUnsigned(), 2) # Try to get an invalid template argument. self.assertEqual( template_type.GetTemplateArgumentKind(2), lldb.eTemplateArgumentKindNull ) self.assertEqual(template_type.GetTemplateArgumentType(2).GetName(), "") + + value = self.expect_expr("temp2", result_type="Foo") + + # Can't get template parameter value with invalid target. 
+ self.assertFalse(value.GetType().GetTemplateArgumentValue(lldb.SBTarget(), 1)) + + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertTrue(template_param_value) + self.assertEqual(template_param_value.GetTypeName(), "short") + self.assertEqual(template_param_value.GetValueAsSigned(), -2) + + value = self.expect_expr("temp3", result_type="Foo") + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertTrue(template_param_value) + self.assertEqual(template_param_value.GetTypeName(), "char") + self.assertEqual(chr(template_param_value.GetValueAsSigned()), "v") + + # FIXME: type should be Foo + # FIXME: double/float NTTP parameter values currently not supported. + value = self.expect_expr("temp4", result_type="Foo") + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertFalse(template_param_value) diff --git a/lldb/test/API/lang/cpp/template-arguments/main.cpp b/lldb/test/API/lang/cpp/template-arguments/main.cpp index 728bd400c2586..0c0eb97cbc858 100644 --- a/lldb/test/API/lang/cpp/template-arguments/main.cpp +++ b/lldb/test/API/lang/cpp/template-arguments/main.cpp @@ -5,4 +5,9 @@ struct C { C temp1; +template struct Foo {}; +Foo temp2; +Foo temp3; +Foo temp4; + int main() {} diff --git a/lldb/test/Shell/Commands/command-disassemble.s b/lldb/test/Shell/Commands/command-disassemble.s index 1625f80468eb1..eb84a9ce39d4a 100644 --- a/lldb/test/Shell/Commands/command-disassemble.s +++ b/lldb/test/Shell/Commands/command-disassemble.s @@ -82,20 +82,26 @@ # CHECK-NEXT: (lldb) disassemble --name case2 # CHECK-NEXT: command-disassemble.s.tmp`n1::case2: # CHECK-NEXT: command-disassemble.s.tmp[0x2044] <+0>: int $0x32 -# CHECK-NEXT: warning: Not disassembling a range because it is very large [0x0000000000002046-0x0000000000004046). To disassemble specify an instruction count limit, start/stop addresses or use the --force option. +# CHECK-NEXT: warning: Not disassembling a function because it is very large [0x0000000000002046-0x0000000000004046). To disassemble specify an instruction count limit, start/stop addresses or use the --force option. # CHECK-NEXT: (lldb) disassemble --name case3 -# CHECK-NEXT: error: Not disassembling a range because it is very large [0x0000000000004046-0x0000000000006046). To disassemble specify an instruction count limit, start/stop addresses or use the --force option. -# CHECK-NEXT: Not disassembling a range because it is very large [0x0000000000006046-0x0000000000008046). To disassemble specify an instruction count limit, start/stop addresses or use the --force option. +# CHECK-NEXT: error: Not disassembling a function because it is very large [0x0000000000006046-0x0000000000007046)[0x0000000000009046-0x000000000000a050). To disassemble specify an instruction count limit, start/stop addresses or use the --force option. +# CHECK-NEXT: Not disassembling a function because it is very large [0x0000000000004046-0x0000000000006046). To disassemble specify an instruction count limit, start/stop addresses or use the --force option. 
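The template-argument tests above drive the new ``SBType::GetTemplateArgumentValue`` API through the Python bindings. A hedged sketch of the same call from the C++ SB API; the function name and the choice of index 1 as a non-type template parameter are illustrative assumptions, not taken from the patch:

.. code-block:: c++

   #include <cstdio>
   #include "lldb/API/SBTarget.h"
   #include "lldb/API/SBType.h"
   #include "lldb/API/SBValue.h"

   // Sketch: type parameters, out-of-bounds indices and invalid targets all
   // yield an invalid SBValue, mirroring what the tests assert.
   static void printTemplateArgValue(lldb::SBTarget target, lldb::SBType type) {
     lldb::SBValue arg = type.GetTemplateArgumentValue(target, 1);
     if (!arg.IsValid()) {
       printf("no constant value for template argument 1\n");
       return;
     }
     printf("%s = %lld\n", arg.GetTypeName(),
            static_cast<long long>(arg.GetValueAsSigned()));
   }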
# CHECK-NEXT: (lldb) disassemble --name case3 --count 3 +# CHECK-NEXT: command-disassemble.s.tmp`n2::case3: +# CHECK-NEXT: command-disassemble.s.tmp[0x6046] <-12288>: int $0x2a +# CHECK-NEXT: command-disassemble.s.tmp[0x6048] <-12286>: int $0x2a +# CHECK-NEXT: command-disassemble.s.tmp[0x604a] <-12284>: int $0x2a +# CHECK-EMPTY: +# CHECK-NEXT: command-disassemble.s.tmp`n2::case3: +# CHECK-NEXT: command-disassemble.s.tmp[0x9046] <+0>: jmp 0x6046 ; <-12288> +## FIXME: This should resolve to `middle_of_case3` +# CHECK-NEXT: command-disassemble.s.tmp[0x904b] <+5>: jmp 0x7046 ; n2::case3 - 8192 +# CHECK-NEXT: command-disassemble.s.tmp[0x9050] <+10>: int $0x2a +# CHECK-EMPTY: # CHECK-NEXT: command-disassemble.s.tmp`n1::case3: # CHECK-NEXT: command-disassemble.s.tmp[0x4046] <+0>: int $0x2a # CHECK-NEXT: command-disassemble.s.tmp[0x4048] <+2>: int $0x2a # CHECK-NEXT: command-disassemble.s.tmp[0x404a] <+4>: int $0x2a -# CHECK-EMPTY: -# CHECK-NEXT: command-disassemble.s.tmp`n2::case3: -# CHECK-NEXT: command-disassemble.s.tmp[0x6046] <+0>: int $0x2a -# CHECK-NEXT: command-disassemble.s.tmp[0x6048] <+2>: int $0x2a -# CHECK-NEXT: command-disassemble.s.tmp[0x604a] <+4>: int $0x2a # CHECK-EMPTY: @@ -158,8 +164,99 @@ _ZN2n15case3Ev: .rept 0x1000 int $42 .endr + .size _ZN2n15case3Ev, .-_ZN2n15case3Ev -_ZN2n25case3Ev: +.L_ZN2n25case3Ev.__part.1: + .rept 0x800 + int $42 + .endr +.L_ZN2n25case3Ev.__part.1_end: + +middle_of_case3: .rept 0x1000 int $42 .endr + +_ZN2n25case3Ev: + jmp .L_ZN2n25case3Ev.__part.1 + jmp middle_of_case3 + .rept 0x800 + int $42 + .endr +.L_ZN2n25case3Ev_end: + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 85 # DW_AT_ranges + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 57 # DW_TAG_namespace + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 85 # DW_AT_ranges + .byte 23 # DW_FORM_sec_offset + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 110 # DW_AT_linkage_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 1 # Abbrev DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 29 # DW_AT_language + .quad 0 # DW_AT_low_pc + .long .Ldebug_ranges0 # DW_AT_ranges + .byte 2 # Abbrev DW_TAG_namespace + .asciz "n2" # DW_AT_name + .byte 3 # Abbrev DW_TAG_subprogram + .long .Ldebug_ranges0 # DW_AT_ranges + .asciz "case3" # DW_AT_name + .asciz "_ZN2n25case3Ev" # DW_AT_linkage_name + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_rnglists,"",@progbits + .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length +.Ldebug_list_header_start0: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 2 # Offset entry count +.Lrnglists_table_base0: + .long .Ldebug_ranges0-.Lrnglists_table_base0 +.Ldebug_ranges0: + .byte 6 # DW_RLE_start_end + .quad _ZN2n25case3Ev + .quad .L_ZN2n25case3Ev_end + .byte 6 # DW_RLE_start_end + .quad .L_ZN2n25case3Ev.__part.1 + .quad .L_ZN2n25case3Ev.__part.1_end + .byte 0 # DW_RLE_end_of_list +.Ldebug_list_header_end0: diff --git a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s index 2e2bc52cd3ff9..1b15561c54283 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s +++ b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s @@ -6,15 +6,24 @@ # CHECK: Found 1 function(s). # CHECK: foo: [input.o[0x0-0xe), input.o[0x14-0x1c)] -# CHECK-NEXT: input.o[0x0]: cmpl $0x0, %edi -# CHECK-NEXT: input.o[0x3]: je 0x14 -# CHECK-NEXT: input.o[0x5]: jmp 0x7 -# CHECK-NEXT: input.o[0x7]: callq 0xe -# CHECK-NEXT: input.o[0xc]: jmp 0x1b +# CHECK-NEXT: input.o[0x0]: callq 0xe +# CHECK-NEXT: input.o[0x5]: jmp 0x1b +# CHECK-NEXT: input.o[0x7]: cmpl $0x0, %edi +# CHECK-NEXT: input.o[0xa]: je 0x14 +# CHECK-NEXT: input.o[0xc]: jmp 0x0 # CHECK-EMPTY: # CHECK-NEXT: input.o[0x14]: callq 0x19 # CHECK-NEXT: input.o[0x19]: jmp 0x1b # CHECK-NEXT: input.o[0x1b]: retq +## Testing the GetRangeIndexForBlockAddress API. "ffffffff" indicates that +## the address does not belong to any range. +# CHECK-NEXT: offset 0x00 => index 0 +# CHECK-NEXT: offset 0x0c => index 0 +# CHECK-NEXT: offset 0x0e => index ffffffff +# CHECK-NEXT: offset 0x13 => index ffffffff +# CHECK-NEXT: offset 0x14 => index 1 +# CHECK-NEXT: offset 0x1b => index 1 +# CHECK-NEXT: offset 0x1c => index ffffffff #--- script.py @@ -28,6 +37,10 @@ def __lldb_init_module(debugger, internal_dict): fn = ctx.function print(f"{fn.name}: {fn.GetRanges()}") print(fn.GetInstructions(target)) + text = fn.addr.section + for offset in [0x00, 0x0c, 0x0e, 0x13, 0x14, 0x1b, 0x1c]: + idx = fn.block.GetRangeIndexForBlockAddress(lldb.SBAddress(text, offset)) + print(f"offset 0x{offset:02x} => index {idx:x}") #--- input.s # An example of a function which has been split into two parts. 
Roughly @@ -40,6 +53,14 @@ def __lldb_init_module(debugger, internal_dict): .text .type foo,@function +foo.__part.1: + .cfi_startproc + callq bar + jmp foo.__part.3 +.Lfoo.__part.1_end: + .size foo.__part.1, .Lfoo.__part.1_end-foo.__part.1 + .cfi_endproc + foo: .cfi_startproc cmpl $0, %edi @@ -49,14 +70,6 @@ foo: .Lfoo_end: .size foo, .Lfoo_end-foo -foo.__part.1: - .cfi_startproc - callq bar - jmp foo.__part.3 -.Lfoo.__part.1_end: - .size foo.__part.1, .Lfoo.__part.1_end-foo.__part.1 - .cfi_endproc - bar: .cfi_startproc movl $47, %eax diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp index 328d6d2e16d59..ad5dfb6a6dded 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/simplified-template-names.cpp @@ -11,12 +11,23 @@ // Test that we following DW_AT_signature correctly. If not, lldb might confuse the types of v1 and v2. // RUN: %clangxx --target=x86_64-pc-linux -g -gsimple-template-names -fdebug-types-section %s -c -o %t2.o // RUN: ld.lld %t2.o -o %t2 -// RUN: %lldb %t2 -o "target variable v1 v2" -o exit | FileCheck %s --check-prefix=TYPE +// RUN: %lldb %t2 -o "target variable v1 v2" \ +// RUN: -o "type lookup t2" -o "type lookup t2" \ +// RUN: -o exit | FileCheck %s --check-prefix=TYPE // LOG: unique name: t3 >::t4 -// TYPE: (t2 >) v1 = {} -// TYPE-NEXT: (t2 >) v2 = {} +// TYPE-LABEL: target variable v1 v2 +// TYPE: (t2 >) v1 = {} +// TYPE: (t2 >) v2 = {} + +// TYPE-LABEL: type lookup t2 +// TYPE: template<> struct t2 { +// TYPE-NEXT: } + +// TYPE-LABEL: type lookup t2 +// TYPE: template<> struct t2 { +// TYPE-NEXT: } struct outer_struct1 { template struct t1 {}; @@ -30,6 +41,9 @@ template struct t2 {}; t2> v1; t2> v2; +t2 v1_1; +t2 v1_2; + template struct t3 { struct t4 {}; }; diff --git a/lldb/tools/lldb-dap/package-lock.json b/lldb/tools/lldb-dap/package-lock.json index 4c18474241421..ab5c7dc33a8e5 100644 --- a/lldb/tools/lldb-dap/package-lock.json +++ b/lldb/tools/lldb-dap/package-lock.json @@ -1,15 +1,15 @@ { "name": "lldb-dap", - "version": "0.2.9", + "version": "0.2.10", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "lldb-dap", - "version": "0.2.9", + "version": "0.2.10", "license": "Apache 2.0 License with LLVM exceptions", "devDependencies": { - "@types/node": "^18.11.18", + "@types/node": "^18.19.41", "@types/vscode": "1.75.0", "@vscode/vsce": "^3.2.2", "prettier": "^3.4.2", @@ -389,10 +389,11 @@ } }, "node_modules/@types/node": { - "version": "18.19.6", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.6.tgz", - "integrity": "sha512-X36s5CXMrrJOs2lQCdDF68apW4Rfx9ixYMawlepwmE4Anezv/AV2LSpKD1Ub8DAc+urp5bk0BGZ6NtmBitfnsg==", + "version": "18.19.75", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.75.tgz", + "integrity": "sha512-UIksWtThob6ZVSyxcOqCLOUNg/dyO1Qvx4McgeuhrEtHTLFTf7BBhEazaE4K806FGTPtzd/2sE90qn4fVr7cyw==", "dev": true, + "license": "MIT", "dependencies": { "undici-types": "~5.26.4" } diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index e866af0602d70..31d808eda4c35 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -1,7 +1,7 @@ { "name": "lldb-dap", "displayName": "LLDB DAP", - "version": "0.2.9", + "version": "0.2.10", "publisher": "llvm-vs-code-extensions", "homepage": "https://lldb.llvm.org", "description": "LLDB debugging from VSCode", @@ -27,7 +27,7 @@ "Debuggers" ], 
"devDependencies": { - "@types/node": "^18.11.18", + "@types/node": "^18.19.41", "@types/vscode": "1.75.0", "@vscode/vsce": "^3.2.2", "prettier-plugin-curly": "^0.3.1", @@ -86,7 +86,7 @@ "default": {}, "description": "The environment of the lldb-dap process.", "additionalProperties": { - "type": "string" + "type": "string" } } } @@ -152,6 +152,10 @@ "program" ], "properties": { + "debugAdapterExecutable": { + "type": "string", + "markdownDescription": "The absolute path to the LLDB debug adapter executable to use." + }, "program": { "type": "string", "description": "Path to the program to debug." @@ -338,6 +342,10 @@ }, "attach": { "properties": { + "debugAdapterExecutable": { + "type": "string", + "markdownDescription": "The absolute path to the LLDB debug adapter executable to use." + }, "program": { "type": "string", "description": "Path to the program to attach to." diff --git a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts index 55c2f3e9f7deb..36107336ebc4d 100644 --- a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts +++ b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts @@ -50,13 +50,13 @@ async function findDAPExecutable(): Promise { const executable = process.platform === "win32" ? "lldb-dap.exe" : "lldb-dap"; // Prefer lldb-dap from Xcode on Darwin. - const xcrun_dap = findWithXcrun(executable); + const xcrun_dap = await findWithXcrun(executable); if (xcrun_dap) { return xcrun_dap; } // Find lldb-dap in the user's path. - const path_dap = findInPath(executable); + const path_dap = await findInPath(executable); if (path_dap) { return path_dap; } @@ -67,12 +67,17 @@ async function findDAPExecutable(): Promise { async function getDAPExecutable( session: vscode.DebugSession, ): Promise { + // Check if the executable was provided in the launch configuration. + const launchConfigPath = session.configuration["debugAdapterExecutable"]; + if (typeof launchConfigPath === "string" && launchConfigPath.length !== 0) { + return launchConfigPath; + } + + // Check if the executable was provided in the extension's configuration. const config = vscode.workspace.getConfiguration( "lldb-dap", session.workspaceFolder, ); - - // Prefer the explicitly specified path in the extension's configuration. 
const configPath = config.get("executable-path"); if (configPath && configPath.length !== 0) { return configPath; diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 767774812ade5..c128fd2ed125c 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -319,6 +319,7 @@ check_symbol_exists(getrusage sys/resource.h HAVE_GETRUSAGE) check_symbol_exists(isatty unistd.h HAVE_ISATTY) check_symbol_exists(futimens sys/stat.h HAVE_FUTIMENS) check_symbol_exists(futimes sys/time.h HAVE_FUTIMES) +check_symbol_exists(getauxval sys/auxv.h HAVE_GETAUXVAL) # AddressSanitizer conflicts with lib/Support/Unix/Signals.inc # Avoid sigaltstack on Apple platforms, where backtrace() cannot handle it # (rdar://7089625) and _Unwind_Backtrace is unusable because it cannot unwind diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index c49f10b9343ff..28655ee3ab87d 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -100,6 +100,8 @@ set(LLVM_ENABLE_PIC @LLVM_ENABLE_PIC@) set(LLVM_BUILD_32_BITS @LLVM_BUILD_32_BITS@) +set(LLVM_BUILD_TELEMETRY @LLVM_BUILD_TELEMETRY@) + if (NOT "@LLVM_PTHREAD_LIB@" STREQUAL "") set(LLVM_PTHREAD_LIB "@LLVM_PTHREAD_LIB@") endif() diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 84980d0c31d4f..899b2cf3b4901 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -18213,6 +18213,9 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_shared_vgpr_count`` 0 GFX10-GFX11 Controls SHARED_VGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table`. + ``.amdhsa_inst_pref_size`` 0 GFX11-GFX12 Controls INST_PREF_SIZE in + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx10-gfx11-table` or + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx12-table` ``.amdhsa_exception_fp_ieee_invalid_op`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_exception_fp_denorm_src`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_FP_DENORMAL_SOURCE in diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 2d72e548ec82a..b9f681f2feed8 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1415,7 +1415,7 @@ Currently, only the following parameter attributes are defined: captured in certain locations. Currently only the return value (``ret``) and other (default) locations are supported. - The `pointer capture section ` discusses these semantics + The :ref:`pointer capture section ` discusses these semantics in more detail. Some examples of how to use the attribute: @@ -2046,8 +2046,8 @@ For example: This attribute specifies the possible memory effects of the call-site or function. It allows specifying the possible access kinds (``none``, ``read``, ``write``, or ``readwrite``) for the possible memory location - kinds (``argmem``, ``inaccessiblemem``, as well as a default). It is best - understood by example: + kinds (``argmem``, ``inaccessiblemem``, ``errnomem``, as well as a default). + It is best understood by example: - ``memory(none)``: Does not access any memory. - ``memory(read)``: May read (but not write) any memory. @@ -2056,6 +2056,8 @@ For example: - ``memory(argmem: read)``: May only read argument memory. - ``memory(argmem: read, inaccessiblemem: write)``: May only read argument memory and only write inaccessible memory. 
+ - ``memory(argmem: read, errnomem: write)``: May only read argument memory + and only write errno. - ``memory(read, argmem: readwrite)``: May read any memory (default mode) and additionally write argument memory. - ``memory(readwrite, argmem: none)``: May access any memory apart from @@ -2085,6 +2087,7 @@ For example: allocator function may return newly accessible memory while only accessing inaccessible memory itself). Inaccessible memory is often used to model control dependencies of intrinsics. + - ``errnomem``: This refers to accesses to the ``errno`` variable. - The default access kind (specified without a location prefix) applies to all locations that haven't been specified explicitly, including those that don't currently have a dedicated location kind (e.g. accesses to globals @@ -20238,18 +20241,31 @@ Overview: """"""""" The '``llvm.vector.experimental.partial.reduce.add.*``' intrinsics reduce the -concatenation of the two vector operands down to the number of elements dictated -by the result type. The result type is a vector type that matches the type of the -first operand vector. +concatenation of the two vector arguments down to the number of elements of the +result vector type. Arguments: """""""""" -Both arguments must be vectors of matching element types. The first argument type must -match the result type, while the second argument type must have a vector length that is a -positive integer multiple of the first vector/result type. The arguments must be either be -both fixed or both scalable vectors. +The first argument is an integer vector with the same type as the result. +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +Semantics: +"""""""""" + +Other than the reduction operator (e.g. add) the way in which the concatenated +arguments is reduced is entirely unspecified. By their nature these intrinsics +are not expected to be useful in isolation but instead implement the first phase +of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 1680b11433537..7eacc58549c7d 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -33,17 +33,12 @@ Marking Functions as Kernels In PTX, there are two types of functions: *device functions*, which are only callable by device code, and *kernel functions*, which are callable by host -code. By default, the back-end will emit device functions. Metadata is used to -declare a function as a kernel function. This metadata is attached to the -``nvvm.annotations`` named metadata object, and has the following format: +code. By default, the back-end will emit device functions. The ``ptx_kernel`` +calling convention is used to declare a function as a kernel function. -.. code-block:: text - - !0 = !{, metadata !"kernel", i32 1} - -The first parameter is a reference to the kernel function. The following -example shows a kernel function calling a device function in LLVM IR. 
The -function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not. +The following example shows a kernel function calling a device function in LLVM +IR. The function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is +not. .. code-block:: llvm @@ -53,18 +48,32 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not. ret float %add } - define void @my_kernel(ptr %ptr) { + define ptx_kernel void @my_kernel(ptr %ptr) { %val = load float, ptr %ptr %ret = call float @my_fmad(float %val, float %val, float %val) store float %ret, ptr %ptr ret void } - !nvvm.annotations = !{!1} - !1 = !{ptr @my_kernel, !"kernel", i32 1} - When compiled, the PTX kernel functions are callable by host-side code. +.. _nvptx_fnattrs: + +Function Attributes +------------------- + +``"nvvm.maxclusterrank"=""`` + This attribute specifies the maximum number of blocks per cluster. Must be + non-zero. Only supported for Hopper+. + +``"nvvm.minctasm"=""`` + This indicates a hint/directive to the compiler/driver, asking it to put at + least these many CTAs on an SM. + +``"nvvm.maxnreg"=""`` + This attribute indicates the maximum number of registers to be used for the + kernel function. + .. _address_spaces: diff --git a/llvm/docs/WritingAnLLVMBackend.rst b/llvm/docs/WritingAnLLVMBackend.rst index 1b9173b1fe139..3c5d594cc605e 100644 --- a/llvm/docs/WritingAnLLVMBackend.rst +++ b/llvm/docs/WritingAnLLVMBackend.rst @@ -954,8 +954,8 @@ Instruction Operand Name Mapping TableGen will also generate a function called getNamedOperandIdx() which can be used to look up an operand's index in a MachineInstr based on its TableGen name. Setting the UseNamedOperandTable bit in an instruction's -TableGen definition will add all of its operands to an enumeration in the -llvm::XXX:OpName namespace and also add an entry for it into the OperandMap +TableGen definition will add all of its operands to an enumeration +llvm::XXX:OpName and also add an entry for it into the OperandMap table, which can be queried using getNamedOperandIdx() .. code-block:: text @@ -978,20 +978,18 @@ XXXInstrInfo.cpp: .. code-block:: c++ - #define GET_INSTRINFO_NAMED_OPS // For getNamedOperandIdx() function + // For getNamedOperandIdx() function definition. + #define GET_INSTRINFO_NAMED_OPS #include "XXXGenInstrInfo.inc" XXXInstrInfo.h: .. code-block:: c++ - #define GET_INSTRINFO_OPERAND_ENUM // For OpName enum + // For OpName enum and getNamedOperandIdx declaration. + #define GET_INSTRINFO_OPERAND_ENUM #include "XXXGenInstrInfo.inc" - namespace XXX { - int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); - } // End namespace XXX - Instruction Operand Types ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h index 06a00d9ae7899..573df8833bd46 100644 --- a/llvm/include/llvm/Analysis/CaptureTracking.h +++ b/llvm/include/llvm/Analysis/CaptureTracking.h @@ -14,11 +14,13 @@ #define LLVM_ANALYSIS_CAPTURETRACKING_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/ModRef.h" namespace llvm { class Value; class Use; + class CaptureInfo; class DataLayout; class Instruction; class DominatorTree; @@ -77,10 +79,47 @@ namespace llvm { const DominatorTree &DT, unsigned MaxUsesToExplore = 0); + /// Capture information for a specific Use. + struct UseCaptureInfo { + /// Components captured by this use. + CaptureComponents UseCC; + /// Components captured by the return value of the user of this Use. 
+ CaptureComponents ResultCC; + + UseCaptureInfo(CaptureComponents UseCC, + CaptureComponents ResultCC = CaptureComponents::None) + : UseCC(UseCC), ResultCC(ResultCC) {} + + static UseCaptureInfo passthrough() { + return UseCaptureInfo(CaptureComponents::None, CaptureComponents::All); + } + + bool isPassthrough() const { + return capturesNothing(UseCC) && capturesAnything(ResultCC); + } + + operator CaptureComponents() const { return UseCC | ResultCC; } + }; + /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters /// to see whether anything was captured. struct CaptureTracker { + /// Action returned from captures(). + enum Action { + /// Stop the traversal. + Stop, + /// Continue traversal, and also follow the return value of the user if + /// it has additional capture components (that is, if it has capture + /// components in Ret that are not part of Other). + Continue, + /// Continue traversal, but do not follow the return value of the user, + /// even if it has additional capture components. Should only be used if + /// captures() has already taken the potential return captures into + /// account. + ContinueIgnoringReturn, + }; + virtual ~CaptureTracker(); /// tooManyUses - The depth of traversal has breached a limit. There may be @@ -94,10 +133,12 @@ namespace llvm { /// U->getUser() is always an Instruction. virtual bool shouldExplore(const Use *U); - /// captured - Information about the pointer was captured by the user of - /// use U. Return true to stop the traversal or false to continue looking - /// for more capturing instructions. - virtual bool captured(const Use *U) = 0; + /// Use U directly captures CI.UseCC and additionally CI.ResultCC + /// through the return value of the user of U. + /// + /// Return one of Stop, Continue or ContinueIgnoringReturn to control + /// further traversal. + virtual Action captured(const Use *U, UseCaptureInfo CI) = 0; /// isDereferenceableOrNull - Overload to allow clients with additional /// knowledge about pointer dereferenceability to provide it and thereby @@ -105,21 +146,18 @@ namespace llvm { virtual bool isDereferenceableOrNull(Value *O, const DataLayout &DL); }; - /// Types of use capture kinds, see \p DetermineUseCaptureKind. - enum class UseCaptureKind { - NO_CAPTURE, - MAY_CAPTURE, - PASSTHROUGH, - }; - /// Determine what kind of capture behaviour \p U may exhibit. /// - /// A use can be no-capture, a use can potentially capture, or a use can be - /// passthrough such that the uses of the user or \p U should be inspected. + /// The returned UseCaptureInfo contains the components captured directly + /// by the use (UseCC) and the components captured through the return value + /// of the user (ResultCC). + /// + /// \p Base is the starting value of the capture analysis, which is + /// relevant for address_is_null captures. /// The \p IsDereferenceableOrNull callback is used to rule out capturing for /// certain comparisons. 
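The ``CaptureTracker::captured`` callback above now receives a ``UseCaptureInfo`` and returns an ``Action``. A minimal tracker sketch against that interface, using only the names shown in this hunk (the struct name is illustrative):

.. code-block:: c++

   #include "llvm/Analysis/CaptureTracking.h"
   using namespace llvm;

   // Sketch: record whether any component of the pointer is captured, and
   // keep following passthrough uses via the user's return value.
   struct AnyCaptureTracker final : CaptureTracker {
     bool Captured = false;

     void tooManyUses() override { Captured = true; }

     Action captured(const Use *U, UseCaptureInfo CI) override {
       if (capturesNothing(CI.UseCC))
         return Continue; // nothing captured directly; follow ResultCC uses
       Captured = true;
       return Stop; // no need to inspect further uses
     }
   };

A caller would hand an instance to ``PointerMayBeCaptured(V, &Tracker)`` as before; only the callback signature changes.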
- UseCaptureKind - DetermineUseCaptureKind(const Use &U, + UseCaptureInfo + DetermineUseCaptureKind(const Use &U, const Value *Base, llvm::function_ref IsDereferenceableOrNull); diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h index be5f746e31a57..dfd84a9addb97 100644 --- a/llvm/include/llvm/Analysis/DDG.h +++ b/llvm/include/llvm/Analysis/DDG.h @@ -453,7 +453,7 @@ bool DependenceGraphInfo::getDependencies( for (auto *SrcI : SrcIList) for (auto *DstI : DstIList) if (auto Dep = - const_cast(&DI)->depends(SrcI, DstI, true)) + const_cast(&DI)->depends(SrcI, DstI)) Deps.push_back(std::move(Dep)); return !Deps.empty(); diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index f0a09644e0f4b..426ac757b4b0d 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -303,12 +303,8 @@ namespace llvm { /// depends - Tests for a dependence between the Src and Dst instructions. /// Returns NULL if no dependence; otherwise, returns a Dependence (or a /// FullDependence) with as much information as can be gleaned. - /// The flag PossiblyLoopIndependent should be set by the caller - /// if it appears that control flow can reach from Src to Dst - /// without traversing a loop back edge. std::unique_ptr depends(Instruction *Src, - Instruction *Dst, - bool PossiblyLoopIndependent); + Instruction *Dst); /// getSplitIteration - Give a dependence that's splittable at some /// particular level, return the iteration that should be used to split diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 6fc6ca14d0889..b675c4f875448 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -497,6 +497,7 @@ class RuntimePointerChecking { Pointers.clear(); Checks.clear(); DiffChecks.clear(); + CheckingGroups.clear(); } /// Insert a pointer and calculate the start and end SCEVs. diff --git a/llvm/include/llvm/Analysis/SparsePropagation.h b/llvm/include/llvm/Analysis/SparsePropagation.h index cc79870229873..b85e11942a320 100644 --- a/llvm/include/llvm/Analysis/SparsePropagation.h +++ b/llvm/include/llvm/Analysis/SparsePropagation.h @@ -244,13 +244,13 @@ SparseSolver::getValueState(LatticeKey Key) { template void SparseSolver::UpdateState(LatticeKey Key, LatticeVal LV) { - auto I = ValueState.find(Key); - if (I != ValueState.end() && I->second == LV) + auto [I, Inserted] = ValueState.try_emplace(Key); + if (!Inserted && I->second == LV) return; // No change. // Update the state of the given LatticeKey and add its corresponding LLVM // value to the work list. 
- ValueState[Key] = std::move(LV); + I->second = std::move(LV); if (Value *V = KeyInfo::getValueFromLatticeKey(Key)) ValueWorkList.push_back(V); } diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 4a431383e0a1c..a53d471f70271 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -201,6 +201,7 @@ enum Kind { kw_readwrite, kw_argmem, kw_inaccessiblemem, + kw_errnomem, // Legacy attributes: kw_argmemonly, diff --git a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h new file mode 100644 index 0000000000000..a5fe6776c685e --- /dev/null +++ b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h @@ -0,0 +1,59 @@ +///===- DroppedVariableStatsMIR.h - Opt Diagnostics -*- C++ -*-------------===// +/// +/// Part of the LLVM Project, under the Apache License v2.0 with LLVM +/// Exceptions. See https://llvm.org/LICENSE.txt for license information. +/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +/// +///===---------------------------------------------------------------------===// +/// \file +/// Dropped Variable Statistics for Debug Information. Reports any number +/// of DBG_VALUEs that get dropped due to an optimization pass. +/// +///===---------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_DROPPEDVARIABLESTATSMIR_H +#define LLVM_CODEGEN_DROPPEDVARIABLESTATSMIR_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/DroppedVariableStats.h" + +namespace llvm { + +/// A class to collect and print dropped debug information due to MIR +/// optimization passes. After every MIR pass is run, it will print how many +/// #DBG_VALUEs were dropped due to that pass. +class DroppedVariableStatsMIR : public DroppedVariableStats { +public: + DroppedVariableStatsMIR() : DroppedVariableStats(false) {} + + void runBeforePass(StringRef PassID, MachineFunction *MF); + + void runAfterPass(StringRef PassID, MachineFunction *MF); + +private: + const MachineFunction *MFunc; + /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or + /// after a pass has run to facilitate dropped variable calculation for an + /// llvm::MachineFunction. + void runOnMachineFunction(const MachineFunction *MF, bool Before); + /// Iterate over all Instructions in a MachineFunction and report any dropped + /// debug information. + void calculateDroppedVarStatsOnMachineFunction(const MachineFunction *MF, + StringRef PassID, + StringRef FuncOrModName); + /// Override base class method to run on an llvm::MachineFunction + /// specifically. + virtual void + visitEveryInstruction(unsigned &DroppedCount, + DenseMap &InlinedAtsMap, + VarID Var) override; + /// Override base class method to run on DBG_VALUEs specifically. 
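``DroppedVariableStatsMIR`` above exposes only ``runBeforePass``/``runAfterPass``, so instrumentation is expected to bracket each MIR pass with them. A sketch under that assumption; the driver function and pass name are placeholders:

.. code-block:: c++

   #include "llvm/CodeGen/DroppedVariableStatsMIR.h"
   using namespace llvm;

   static void runSomeMIRPass(MachineFunction &MF) {} // placeholder pass body

   // Sketch: snapshot debug variables before the pass, then report any
   // DBG_VALUEs that were dropped by it.
   static void runWithDroppedVarStats(MachineFunction &MF) {
     DroppedVariableStatsMIR Stats;
     Stats.runBeforePass("SomeMIRPass", &MF);
     runSomeMIRPass(MF);
     Stats.runAfterPass("SomeMIRPass", &MF);
   }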
+ virtual void visitEveryDebugRecord( + DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) override; +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index f95a02aad4559..0362b501ed347 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -468,9 +468,10 @@ class NodeSet { SUnit *SuccSUnit = Succ.getDst(); if (V != SuccSUnit) continue; - if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) { - SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency(); - } + unsigned &DU = SUnitToDistance[U]; + unsigned &DV = SUnitToDistance[V]; + if (DU + Succ.getLatency() > DV) + DV = DU + Succ.getLatency(); } } // Handle a back-edge in loop carried dependencies diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 4762494e6ccb7..98cc05842847f 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -98,6 +98,12 @@ #include namespace llvm { +namespace impl_detail { +// FIXME: Remove these declarations once RegisterClassInfo is queryable as an +// analysis. +class MachineSchedulerImpl; +class PostMachineSchedulerImpl; +} // namespace impl_detail namespace MISched { enum Direction { @@ -1078,9 +1084,23 @@ class GenericSchedulerBase : public MachineSchedStrategy { /// Represent the type of SchedCandidate found within a single queue. /// pickNodeBidirectional depends on these listed by decreasing priority. enum CandReason : uint8_t { - NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak, - RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, - TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; + NoCand, + Only1, + PhysReg, + RegExcess, + RegCritical, + Stall, + Cluster, + Weak, + RegMax, + ResourceReduce, + ResourceDemand, + BotHeightReduce, + BotPathReduce, + TopDepthReduce, + TopPathReduce, + NodeOrder + }; #ifndef NDEBUG static const char *getReasonStr(GenericSchedulerBase::CandReason Reason); @@ -1385,6 +1405,34 @@ std::unique_ptr createCopyConstrainDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI); +class MachineSchedulerPass : public PassInfoMixin { + // FIXME: Remove this member once RegisterClassInfo is queryable as an + // analysis. + std::unique_ptr Impl; + const TargetMachine *TM; + +public: + MachineSchedulerPass(const TargetMachine *TM); + MachineSchedulerPass(MachineSchedulerPass &&Other); + ~MachineSchedulerPass(); + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +class PostMachineSchedulerPass + : public PassInfoMixin { + // FIXME: Remove this member once RegisterClassInfo is queryable as an + // analysis. 
+ std::unique_ptr Impl; + const TargetMachine *TM; + +public: + PostMachineSchedulerPass(const TargetMachine *TM); + PostMachineSchedulerPass(PostMachineSchedulerPass &&Other); + ~PostMachineSchedulerPass(); + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; } // end namespace llvm #endif // LLVM_CODEGEN_MACHINESCHEDULER_H diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 8c1e2fa6f57a8..6eff6bfe8d5b1 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1645,7 +1645,7 @@ class ShuffleVectorSDNode : public SDNode { return Mask[Idx]; } - bool isSplat() const { return isSplatMask(Mask, getValueType(0)); } + bool isSplat() const { return isSplatMask(getMask()); } int getSplatIndex() const { assert(isSplat() && "Cannot get splat index for non-splat!"); @@ -1659,7 +1659,7 @@ class ShuffleVectorSDNode : public SDNode { return 0; } - static bool isSplatMask(const int *Mask, EVT VT); + static bool isSplatMask(ArrayRef Mask); /// Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 97de0197da9b4..cdbefb36c00c7 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -229,10 +229,17 @@ class TargetFrameLowering { /// Returns true if we may need to fix the unwind information for the /// function. - virtual bool enableCFIFixup(MachineFunction &MF) const; + virtual bool enableCFIFixup(const MachineFunction &MF) const; + + /// enableFullCFIFixup - Returns true if we may need to fix the unwind + /// information such that it is accurate for *every* instruction in the + /// function (e.g. if the function has an async unwind table). + virtual bool enableFullCFIFixup(const MachineFunction &MF) const { + return enableCFIFixup(MF); + }; /// Emit CFI instructions that recreate the state of the unwind information - /// upon fucntion entry. + /// upon function entry. 
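The ``TargetFrameLowering`` hunk just above adds ``enableFullCFIFixup`` alongside the now-``const`` ``enableCFIFixup``. A sketch of how a target might wire the pair; the class name, base-constructor arguments and the async-unwind test are assumptions for illustration, not taken from the patch:

.. code-block:: c++

   #include "llvm/CodeGen/MachineFunction.h"
   #include "llvm/CodeGen/TargetFrameLowering.h"
   #include "llvm/IR/Function.h"
   #include "llvm/Support/CodeGen.h"
   using namespace llvm;

   class ExampleFrameLowering : public TargetFrameLowering {
   public:
     ExampleFrameLowering()
         : TargetFrameLowering(StackGrowsDown, Align(16),
                               /*LocalAreaOffset=*/0) {}

     bool enableCFIFixup(const MachineFunction &MF) const override {
       return MF.needsFrameMoves();
     }

     // Ask for per-instruction CFI accuracy only for async unwind tables.
     bool enableFullCFIFixup(const MachineFunction &MF) const override {
       return enableCFIFixup(MF) &&
              MF.getFunction().getUWTableKind() == UWTableKind::Async;
     }
     // Pure virtuals such as emitPrologue/emitEpilogue are omitted here.
   };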
virtual void resetCFIToInitialState(MachineBasicBlock &MBB) const {} /// Replace a StackProbe stub (if any) with the actual probe code inline diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index f6f10ea4f4f83..835201f2a45b0 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -295,4 +295,6 @@ #cmakedefine HAVE_BUILTIN_THREAD_POINTER ${HAVE_BUILTIN_THREAD_POINTER} +#cmakedefine HAVE_GETAUXVAL ${HAVE_GETAUXVAL} + #endif diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 629977cc11d68..239f9dd3f38db 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -201,4 +201,7 @@ /* Define if logf128 is available */ #cmakedefine LLVM_HAS_LOGF128 +/* Define if building LLVM with LLVM_BUILD_TELEMETRY */ +#cmakedefine LLVM_BUILD_TELEMETRY ${LLVM_BUILD_TELEMETRY} + #endif diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h index db5ff135a7164..0f59edd429332 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h @@ -90,10 +90,6 @@ using SPSRunAsIntFunctionSignature = int32_t(shared::SPSExecutorAddr, int32_t); } // end namespace rt namespace rt_alt { -extern const char *UnwindInfoManagerInstanceName; -extern const char *UnwindInfoManagerFindSectionsHelperName; -extern const char *UnwindInfoManagerEnableWrapperName; -extern const char *UnwindInfoManagerDisableWrapperName; extern const char *UnwindInfoManagerRegisterActionName; extern const char *UnwindInfoManagerDeregisterActionName; } // end namespace rt_alt diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h index fc7719f282122..847c340eff17d 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h @@ -15,14 +15,13 @@ #define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_UNWINDINFOMANAGER_H #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" -#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorBootstrapService.h" #include "llvm/Support/Error.h" #include #include namespace llvm::orc { -class UnwindInfoManager : public ExecutorBootstrapService { +class UnwindInfoManager { public: // This struct's layout should match the unw_dynamic_unwind_sections struct // from libunwind/src/libunwid_ext.h. @@ -34,43 +33,40 @@ class UnwindInfoManager : public ExecutorBootstrapService { size_t compact_unwind_section_length; }; + UnwindInfoManager(UnwindInfoManager &&) = delete; + UnwindInfoManager &operator=(UnwindInfoManager &&) = delete; + ~UnwindInfoManager(); + /// If the libunwind find-dynamic-unwind-info callback registration APIs are - /// available then this method will return an UnwindInfoManager instance, - /// otherwise it will return nullptr. - static std::unique_ptr TryCreate(); + /// available then this method will instantiate a global UnwindInfoManager + /// instance suitable for the process and return true. Otherwise it will + /// return false. 
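The ``UnwindInfoManager`` interface in this hunk becomes all-static. A brief sketch of executor-side enablement under that reading; whether it succeeds depends on libunwind's dynamic unwind-info hooks being present:

.. code-block:: c++

   #include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h"
   using namespace llvm::orc;

   // Sketch: try to install the process-wide unwind-info manager. Concrete
   // sections are registered later through UnwindInfoManager::registerSections
   // and removed again with UnwindInfoManager::deregisterSections.
   static bool setUpJITUnwindInfo() {
     return UnwindInfoManager::TryEnable();
   }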
+ static bool TryEnable(); - Error shutdown() override; - void addBootstrapSymbols(StringMap &M) override; + static void addBootstrapSymbols(StringMap &M); - Error enable(void *FindDynamicUnwindSections); - Error disable(void); + static Error registerSections(ArrayRef CodeRanges, + orc::ExecutorAddr DSOBase, + orc::ExecutorAddrRange DWARFEHFrame, + orc::ExecutorAddrRange CompactUnwind); - Error registerSections(ArrayRef CodeRanges, - orc::ExecutorAddr DSOBase, - orc::ExecutorAddrRange DWARFEHFrame, - orc::ExecutorAddrRange CompactUnwind); + static Error deregisterSections(ArrayRef CodeRanges); - Error deregisterSections(ArrayRef CodeRanges); +private: + UnwindInfoManager() = default; - int findSections(uintptr_t Addr, UnwindSections *Info); + int findSectionsImpl(uintptr_t Addr, UnwindSections *Info); + static int findSections(uintptr_t Addr, UnwindSections *Info); -private: - UnwindInfoManager(int (*AddFindDynamicUnwindSections)(void *), - int (*RemoveFindDynamicUnwindSections)(void *)) - : AddFindDynamicUnwindSections(AddFindDynamicUnwindSections), - RemoveFindDynamicUnwindSections(RemoveFindDynamicUnwindSections) {} + Error registerSectionsImpl(ArrayRef CodeRanges, + orc::ExecutorAddr DSOBase, + orc::ExecutorAddrRange DWARFEHFrame, + orc::ExecutorAddrRange CompactUnwind); - static int findSectionsHelper(UnwindInfoManager *Instance, uintptr_t Addr, - UnwindSections *Info); + Error deregisterSectionsImpl(ArrayRef CodeRanges); std::mutex M; std::map UWSecs; - - int (*AddFindDynamicUnwindSections)(void *) = nullptr; - int (*RemoveFindDynamicUnwindSections)(void *) = nullptr; - void *FindDynamicUnwindSections = nullptr; - - static const char *AddFnName, *RemoveFnName; }; } // namespace llvm::orc diff --git a/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h index eb883a79a93d8..65f20ad3b2163 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h @@ -19,15 +19,17 @@ namespace llvm::orc { class UnwindInfoRegistrationPlugin : public LinkGraphLinkingLayer::Plugin { public: - static Expected> - Create(IRLayer &IRL, JITDylib &PlatformJD, ExecutorAddr Instance, - ExecutorAddr FindHelper, ExecutorAddr Enable, ExecutorAddr Disable, - ExecutorAddr Register, ExecutorAddr Deregister); + UnwindInfoRegistrationPlugin(ExecutionSession &ES, ExecutorAddr Register, + ExecutorAddr Deregister) + : ES(ES), Register(Register), Deregister(Deregister) { + DSOBaseName = ES.intern("__jitlink$libunwind_dso_base"); + } static Expected> - Create(IRLayer &IRL, JITDylib &PlatformJD); + Create(ExecutionSession &ES, ExecutorAddr Register, ExecutorAddr Deregister); - ~UnwindInfoRegistrationPlugin(); + static Expected> + Create(ExecutionSession &ES); void modifyPassConfig(MaterializationResponsibility &MR, jitlink::LinkGraph &G, @@ -49,20 +51,11 @@ class UnwindInfoRegistrationPlugin : public LinkGraphLinkingLayer::Plugin { ResourceKey SrcKey) override {} private: - UnwindInfoRegistrationPlugin(ExecutionSession &ES, ExecutorAddr Instance, - ExecutorAddr Disable, ExecutorAddr Register, - ExecutorAddr Deregister) - : ES(ES), Instance(Instance), Disable(Disable), Register(Register), - Deregister(Deregister) { - DSOBaseName = ES.intern("__jitlink$libunwind_dso_base"); - } - - static Expected makeBouncerModule(ExecutionSession &ES); Error addUnwindInfoRegistrationActions(jitlink::LinkGraph &G); ExecutionSession &ES; SymbolStringPtr 
DSOBaseName; - ExecutorAddr Instance, Disable, Register, Deregister; + ExecutorAddr Register, Deregister; }; } // namespace llvm::orc diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 44a9a37c70597..f974cfc78c8dd 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -1269,6 +1269,7 @@ __OMP_TRAIT_PROPERTY(device, arch, x86_64) __OMP_TRAIT_PROPERTY(device, arch, amdgcn) __OMP_TRAIT_PROPERTY(device, arch, nvptx) __OMP_TRAIT_PROPERTY(device, arch, nvptx64) +__OMP_TRAIT_PROPERTY(device, arch, spirv64) __OMP_TRAIT_SET(target_device) @@ -1301,6 +1302,7 @@ __OMP_TRAIT_PROPERTY(target_device, arch, x86_64) __OMP_TRAIT_PROPERTY(target_device, arch, amdgcn) __OMP_TRAIT_PROPERTY(target_device, arch, nvptx) __OMP_TRAIT_PROPERTY(target_device, arch, nvptx64) +__OMP_TRAIT_PROPERTY(target_device, arch, spirv64) __OMP_TRAIT_SET(implementation) diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index 8bee9f4703dd9..f6520fd855988 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -92,33 +92,15 @@ namespace llvm { /// Create an \a temporary node and track it in \a UnresolvedNodes. void trackIfUnresolved(MDNode *N); - /// Internal helper for insertDeclare. - DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo, - DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, Instruction *InsertBefore); - - /// Internal helper for insertLabel. - DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL, - BasicBlock *InsertBB, Instruction *InsertBefore); - /// Internal helper. Track metadata if untracked and insert \p DVR. - void insertDbgVariableRecord(DbgVariableRecord *DVR, BasicBlock *InsertBB, - Instruction *InsertBefore, - bool InsertAtHead = false); + void insertDbgVariableRecord(DbgVariableRecord *DVR, + InsertPosition InsertPt); /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic. Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, - Instruction *InsertBefore); - - /// Internal helper for insertDbgValueIntrinsic. - DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val, - DILocalVariable *VarInfo, - DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, - Instruction *InsertBefore); + InsertPosition InsertPt); public: /// Construct a builder for a module. @@ -995,46 +977,28 @@ namespace llvm { /// \param VarInfo Variable's debug info descriptor. /// \param Expr A complex location expression. /// \param DL Debug info location. - /// \param InsertBefore Location for the new intrinsic. + /// \param InsertPt Location for the new intrinsic. DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, - Instruction *InsertBefore); + InsertPosition InsertPt); /// Insert a new llvm.dbg.label intrinsic call. /// \param LabelInfo Label's debug info descriptor. /// \param DL Debug info location. /// \param InsertBefore Location for the new intrinsic. DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL, - Instruction *InsertBefore); - - /// Insert a new llvm.dbg.label intrinsic call. - /// \param LabelInfo Label's debug info descriptor. - /// \param DL Debug info location. - /// \param InsertAtEnd Location for the new intrinsic. 
- DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL, - BasicBlock *InsertAtEnd); + InsertPosition InsertPt); /// Insert a new llvm.dbg.value intrinsic call. /// \param Val llvm::Value of the variable /// \param VarInfo Variable's debug info descriptor. /// \param Expr A complex location expression. /// \param DL Debug info location. - /// \param InsertAtEnd Location for the new intrinsic. - DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val, - DILocalVariable *VarInfo, - DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertAtEnd); - - /// Insert a new llvm.dbg.value intrinsic call. - /// \param Val llvm::Value of the variable - /// \param VarInfo Variable's debug info descriptor. - /// \param Expr A complex location expression. - /// \param DL Debug info location. - /// \param InsertBefore Location for the new intrinsic. + /// \param InsertPt Location for the new intrinsic. DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, - Instruction *InsertBefore); + InsertPosition InsertPt); /// Replace the vtable holder in the given type. /// diff --git a/llvm/include/llvm/Passes/DroppedVariableStats.h b/llvm/include/llvm/IR/DroppedVariableStats.h similarity index 52% rename from llvm/include/llvm/Passes/DroppedVariableStats.h rename to llvm/include/llvm/IR/DroppedVariableStats.h index 30fbeae703b03..ebd74a69a8b91 100644 --- a/llvm/include/llvm/Passes/DroppedVariableStats.h +++ b/llvm/include/llvm/IR/DroppedVariableStats.h @@ -14,13 +14,19 @@ #ifndef LLVM_CODEGEN_DROPPEDVARIABLESTATS_H #define LLVM_CODEGEN_DROPPEDVARIABLESTATS_H -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/PassInstrumentation.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include namespace llvm { +class DIScope; +class DILocalVariable; +class Function; +class DILocation; +class DebugLoc; +class StringRef; + /// A unique key that represents a debug variable. /// First const DIScope *: Represents the scope of the debug variable. /// Second const DIScope *: Represents the InlinedAt scope of the debug @@ -33,13 +39,7 @@ using VarID = /// statistics. class DroppedVariableStats { public: - DroppedVariableStats(bool DroppedVarStatsEnabled) - : DroppedVariableStatsEnabled(DroppedVarStatsEnabled) { - if (DroppedVarStatsEnabled) - llvm::outs() - << "Pass Level, Pass Name, Num of Dropped Variables, Func or " - "Module Name\n"; - }; + DroppedVariableStats(bool DroppedVarStatsEnabled); virtual ~DroppedVariableStats() {} @@ -50,20 +50,9 @@ class DroppedVariableStats { bool getPassDroppedVariables() { return PassDroppedVariables; } protected: - void setup() { - DebugVariablesStack.push_back( - {DenseMap()}); - InlinedAts.push_back( - {DenseMap>()}); - } - - void cleanup() { - assert(!DebugVariablesStack.empty() && - "DebugVariablesStack shouldn't be empty!"); - assert(!InlinedAts.empty() && "InlinedAts shouldn't be empty!"); - DebugVariablesStack.pop_back(); - InlinedAts.pop_back(); - } + void setup(); + + void cleanup(); bool DroppedVariableStatsEnabled = false; struct DebugVariables { @@ -73,7 +62,6 @@ class DroppedVariableStats { DenseSet DebugVariablesAfter; }; -protected: /// A stack of a DenseMap, that maps DebugVariables for every pass to an /// llvm::Function. A stack is used because an optimization pass can call /// other passes. 
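Note on the hunk above and the one that follows: the constructor being moved out of line is what prints the header row for these statistics, and calculateDroppedStatsAndPrint appends one row for every pass that drops at least one #dbg_value. A hypothetical run with the statistics enabled would therefore emit output shaped like the sketch below; the pass, function and module names are invented purely for illustration.

    Pass Level, Pass Name, Num of Dropped Variables, Func or Module Name
    Function, SROAPass, 2, foo
    Module, DeadArgumentEliminationPass, 1, example.ll
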
@@ -90,78 +78,27 @@ class DroppedVariableStats { void calculateDroppedStatsAndPrint(DebugVariables &DbgVariables, StringRef FuncName, StringRef PassID, StringRef FuncOrModName, - StringRef PassLevel, - const Function *Func) { - unsigned DroppedCount = 0; - DenseSet &DebugVariablesBeforeSet = - DbgVariables.DebugVariablesBefore; - DenseSet &DebugVariablesAfterSet = DbgVariables.DebugVariablesAfter; - auto It = InlinedAts.back().find(FuncName); - if (It == InlinedAts.back().end()) - return; - DenseMap &InlinedAtsMap = It->second; - // Find an Instruction that shares the same scope as the dropped #dbg_value - // or has a scope that is the child of the scope of the #dbg_value, and has - // an inlinedAt equal to the inlinedAt of the #dbg_value or it's inlinedAt - // chain contains the inlinedAt of the #dbg_value, if such an Instruction is - // found, debug information is dropped. - for (VarID Var : DebugVariablesBeforeSet) { - if (DebugVariablesAfterSet.contains(Var)) - continue; - visitEveryInstruction(DroppedCount, InlinedAtsMap, Var); - removeVarFromAllSets(Var, Func); - } - if (DroppedCount > 0) { - llvm::outs() << PassLevel << ", " << PassID << ", " << DroppedCount - << ", " << FuncOrModName << "\n"; - PassDroppedVariables = true; - } else - PassDroppedVariables = false; - } + StringRef PassLevel, const Function *Func); /// Check if a \p Var has been dropped or is a false positive. Also update the /// \p DroppedCount if a debug variable is dropped. bool updateDroppedCount(DILocation *DbgLoc, const DIScope *Scope, const DIScope *DbgValScope, DenseMap &InlinedAtsMap, - VarID Var, unsigned &DroppedCount) { - // If the Scope is a child of, or equal to the DbgValScope and is inlined at - // the Var's InlinedAt location, return true to signify that the Var has - // been dropped. - if (isScopeChildOfOrEqualTo(Scope, DbgValScope)) - if (isInlinedAtChildOfOrEqualTo(DbgLoc->getInlinedAt(), - InlinedAtsMap[Var])) { - // Found another instruction in the variable's scope, so there exists a - // break point at which the variable could be observed. Count it as - // dropped. - DroppedCount++; - return true; - } - return false; - } + VarID Var, unsigned &DroppedCount); + /// Run code to populate relevant data structures over an llvm::Function or /// llvm::MachineFunction. - void run(DebugVariables &DbgVariables, StringRef FuncName, bool Before) { - auto &VarIDSet = (Before ? DbgVariables.DebugVariablesBefore - : DbgVariables.DebugVariablesAfter); - auto &InlinedAtsMap = InlinedAts.back(); - if (Before) - InlinedAtsMap.try_emplace(FuncName, DenseMap()); - VarIDSet = DenseSet(); - visitEveryDebugRecord(VarIDSet, InlinedAtsMap, FuncName, Before); - } + void run(DebugVariables &DbgVariables, StringRef FuncName, bool Before); + /// Populate the VarIDSet and InlinedAtMap with the relevant information /// needed for before and after pass analysis to determine dropped variable /// status. void populateVarIDSetAndInlinedMap( const DILocalVariable *DbgVar, DebugLoc DbgLoc, DenseSet &VarIDSet, DenseMap> &InlinedAtsMap, - StringRef FuncName, bool Before) { - VarID Key{DbgVar->getScope(), DbgLoc->getInlinedAtScope(), DbgVar}; - VarIDSet.insert(Key); - if (Before) - InlinedAtsMap[FuncName].try_emplace(Key, DbgLoc.getInlinedAt()); - } + StringRef FuncName, bool Before); + /// Visit every llvm::Instruction or llvm::MachineInstruction and check if the /// debug variable denoted by its ID \p Var may have been dropped by an /// optimization pass. 
@@ -179,47 +116,18 @@ class DroppedVariableStats { private: /// Remove a dropped debug variable's VarID from all Sets in the /// DroppedVariablesBefore stack. - void removeVarFromAllSets(VarID Var, const Function *F) { - // Do not remove Var from the last element, it will be popped from the - // stack. - for (auto &DebugVariablesMap : llvm::drop_end(DebugVariablesStack)) - DebugVariablesMap[F].DebugVariablesBefore.erase(Var); - } + void removeVarFromAllSets(VarID Var, const Function *F); + /// Return true if \p Scope is the same as \p DbgValScope or a child scope of /// \p DbgValScope, return false otherwise. bool isScopeChildOfOrEqualTo(const DIScope *Scope, - const DIScope *DbgValScope) { - while (Scope != nullptr) { - if (VisitedScope.find(Scope) == VisitedScope.end()) { - VisitedScope.insert(Scope); - if (Scope == DbgValScope) { - VisitedScope.clear(); - return true; - } - Scope = Scope->getScope(); - } else { - VisitedScope.clear(); - return false; - } - } - return false; - } + const DIScope *DbgValScope); + /// Return true if \p InlinedAt is the same as \p DbgValInlinedAt or part of /// the InlinedAt chain, return false otherwise. bool isInlinedAtChildOfOrEqualTo(const DILocation *InlinedAt, - const DILocation *DbgValInlinedAt) { - if (DbgValInlinedAt == InlinedAt) - return true; - if (!DbgValInlinedAt) - return false; - auto *IA = InlinedAt; - while (IA) { - if (IA == DbgValInlinedAt) - return true; - IA = IA->getInlinedAt(); - } - return false; - } + const DILocation *DbgValInlinedAt); + bool PassDroppedVariables = false; }; diff --git a/llvm/include/llvm/Passes/DroppedVariableStatsIR.h b/llvm/include/llvm/IR/DroppedVariableStatsIR.h similarity index 74% rename from llvm/include/llvm/Passes/DroppedVariableStatsIR.h rename to llvm/include/llvm/IR/DroppedVariableStatsIR.h index 18847b5c1ead8..72b91dbc7ed52 100644 --- a/llvm/include/llvm/Passes/DroppedVariableStatsIR.h +++ b/llvm/include/llvm/IR/DroppedVariableStatsIR.h @@ -14,12 +14,17 @@ #ifndef LLVM_CODEGEN_DROPPEDVARIABLESTATSIR_H #define LLVM_CODEGEN_DROPPEDVARIABLESTATSIR_H -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Module.h" -#include "llvm/Passes/DroppedVariableStats.h" +#include "llvm/IR/DroppedVariableStats.h" namespace llvm { +class Any; +class StringRef; +class PassInstrumentationCallbacks; +class Function; +class Module; +class DILocation; + /// A class to collect and print dropped debug information due to LLVM IR /// optimization passes. After every LLVM IR pass is run, it will print how many /// #dbg_values were dropped due to that pass. 
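DroppedVariableStatsIR is driven entirely through pass-instrumentation callbacks: runBeforePass snapshots the debug variables visible before a pass, and runAfterPass diffs against that snapshot and prints the CSV rows. Below is a minimal wiring sketch that assumes only what this header declares; the helper name is invented, and the include added to StandardInstrumentations.h later in this patch suggests the in-tree wiring lives there.

    #include "llvm/IR/DroppedVariableStatsIR.h"
    #include "llvm/IR/PassInstrumentation.h"

    // Sketch: let the stats object register itself with the instrumentation
    // callbacks so every IR pass is bracketed by runBeforePass/runAfterPass.
    static void enableDroppedVarStats(llvm::PassInstrumentationCallbacks &PIC) {
      // Kept alive for the whole pipeline; enabled unconditionally for the sketch.
      static llvm::DroppedVariableStatsIR Stats(/*DroppedVarStatsEnabled=*/true);
      Stats.registerCallbacks(PIC);
    }
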
@@ -28,56 +33,42 @@ class DroppedVariableStatsIR : public DroppedVariableStats { DroppedVariableStatsIR(bool DroppedVarStatsEnabled) : llvm::DroppedVariableStats(DroppedVarStatsEnabled) {} - void runBeforePass(StringRef P, Any IR) { - setup(); - if (const auto *M = unwrapIR(IR)) - return this->runOnModule(P, M, true); - if (const auto *F = unwrapIR(IR)) - return this->runOnFunction(P, F, true); - } - - void runAfterPass(StringRef P, Any IR) { - if (const auto *M = unwrapIR(IR)) - runAfterPassModule(P, M); - else if (const auto *F = unwrapIR(IR)) - runAfterPassFunction(P, F); - cleanup(); - } + void runBeforePass(StringRef P, Any IR); + + void runAfterPass(StringRef P, Any IR); void registerCallbacks(PassInstrumentationCallbacks &PIC); private: const Function *Func; - void runAfterPassFunction(StringRef PassID, const Function *F) { - runOnFunction(PassID, F, false); - calculateDroppedVarStatsOnFunction(F, PassID, F->getName().str(), - "Function"); - } + void runAfterPassFunction(StringRef PassID, const Function *F); + + void runAfterPassModule(StringRef PassID, const Module *M); - void runAfterPassModule(StringRef PassID, const Module *M) { - runOnModule(PassID, M, false); - calculateDroppedVarStatsOnModule(M, PassID, M->getName().str(), "Module"); - } /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or /// after a pass has run to facilitate dropped variable calculation for an /// llvm::Function. void runOnFunction(StringRef PassID, const Function *F, bool Before); + /// Iterate over all Instructions in a Function and report any dropped debug /// information. void calculateDroppedVarStatsOnFunction(const Function *F, StringRef PassID, StringRef FuncOrModName, StringRef PassLevel); + /// Populate DebugVariablesBefore, DebugVariablesAfter, InlinedAts before or /// after a pass has run to facilitate dropped variable calculation for an /// llvm::Module. Calls runOnFunction on every Function in the Module. void runOnModule(StringRef PassID, const Module *M, bool Before); + /// Iterate over all Functions in a Module and report any dropped debug /// information. Will call calculateDroppedVarStatsOnFunction on every /// Function. void calculateDroppedVarStatsOnModule(const Module *M, StringRef PassID, StringRef FuncOrModName, StringRef PassLevel); + /// Override base class method to run on an llvm::Function specifically. virtual void visitEveryInstruction(unsigned &DroppedCount, @@ -90,10 +81,7 @@ class DroppedVariableStatsIR : public DroppedVariableStats { DenseMap> &InlinedAtsMap, StringRef FuncName, bool Before) override; - template static const IRUnitT *unwrapIR(Any IR) { - const IRUnitT **IRPtr = llvm::any_cast(&IR); - return IRPtr ? 
*IRPtr : nullptr; - } + template static const IRUnitT *unwrapIR(Any IR); }; } // namespace llvm diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 833c91fd97461..1692f7dfb7fa7 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2708,12 +2708,17 @@ class IRBuilder : public IRBuilderBase { InserterTy Inserter; public: - IRBuilder(LLVMContext &C, FolderTy Folder, InserterTy Inserter = InserterTy(), + IRBuilder(LLVMContext &C, FolderTy Folder, InserterTy Inserter, MDNode *FPMathTag = nullptr, ArrayRef OpBundles = {}) : IRBuilderBase(C, this->Folder, this->Inserter, FPMathTag, OpBundles), Folder(Folder), Inserter(Inserter) {} + IRBuilder(LLVMContext &C, FolderTy Folder, MDNode *FPMathTag = nullptr, + ArrayRef OpBundles = {}) + : IRBuilderBase(C, this->Folder, this->Inserter, FPMathTag, OpBundles), + Folder(Folder) {} + explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr, ArrayRef OpBundles = {}) : IRBuilderBase(C, this->Folder, this->Inserter, FPMathTag, OpBundles) {} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index eb7bde6999491..d5d185ebc12f6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -237,16 +237,16 @@ def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< // the second one is copied to m0 def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], - [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; + [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback]>; def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], - [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrNoCallback]>; // gfx11 intrinsic // The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. 
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], - [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; + [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback]>; // Vanilla workgroup sync-barrier def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 6d74d7f24bf9a..b8df4d1ecab1d 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -209,7 +209,7 @@ void initializeMachinePipelinerPass(PassRegistry &); void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &); void initializeMachineRegionInfoPassPass(PassRegistry &); void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &); -void initializeMachineSchedulerPass(PassRegistry &); +void initializeMachineSchedulerLegacyPass(PassRegistry &); void initializeMachineSinkingPass(PassRegistry &); void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &); void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &); @@ -238,7 +238,7 @@ void initializePostDomPrinterWrapperPassPass(PassRegistry &); void initializePostDomViewerWrapperPassPass(PassRegistry &); void initializePostDominatorTreeWrapperPassPass(PassRegistry &); void initializePostInlineEntryExitInstrumenterPass(PassRegistry &); -void initializePostMachineSchedulerPass(PassRegistry &); +void initializePostMachineSchedulerLegacyPass(PassRegistry &); void initializePostRAHazardRecognizerPass(PassRegistry &); void initializePostRAMachineSinkingPass(PassRegistry &); void initializePostRASchedulerLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index 57ba40f7ac26f..e97c890ce9135 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -527,13 +527,6 @@ class MCContext { /// \name Section Management /// @{ - enum : unsigned { - /// Pass this value as the UniqueID during section creation to get the - /// generic section with the given name and characteristics. The usual - /// sections such as .text use this ID. - GenericSectionID = ~0U - }; - /// Return the MCSection for the specified mach-o section. This requires /// the operands to be valid. MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section, @@ -611,7 +604,7 @@ class MCContext { MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, StringRef COMDATSymName, int Selection, - unsigned UniqueID = GenericSectionID); + unsigned UniqueID = MCSection::NonUniqueID); MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics); @@ -621,7 +614,7 @@ class MCContext { /// as Sec and the function symbol as KeySym. 
MCSectionCOFF * getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym, - unsigned UniqueID = GenericSectionID); + unsigned UniqueID = MCSection::NonUniqueID); MCSectionSPIRV *getSPIRVSection(); diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h index 9c86b4ecc6599..8065d17546d2e 100644 --- a/llvm/include/llvm/MC/MCELFStreamer.h +++ b/llvm/include/llvm/MC/MCELFStreamer.h @@ -105,7 +105,7 @@ class MCELFStreamer : public MCObjectStreamer { unsigned IntValue; std::string StringValue; AttributeItem(Types Ty, unsigned Tg, unsigned IV, std::string SV) - : Type(Ty), Tag(Tg), IntValue(IV), StringValue(SV) {} + : Type(Ty), Tag(Tg), IntValue(IV), StringValue(std::move(SV)) {} }; /// ELF object attributes subsection support diff --git a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h index 4c88448e6a128..443877391072c 100644 --- a/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h @@ -122,15 +122,13 @@ struct ParseInstructionInfo { : AsmRewrites(rewrites) {} }; -enum OperandMatchResultTy { - MatchOperand_Success, // operand matched successfully - MatchOperand_NoMatch, // operand did not match - MatchOperand_ParseFail // operand matched but had errors -}; - /// Ternary parse status returned by various parse* methods. class ParseStatus { - enum class StatusTy { Success, Failure, NoMatch } Status; + enum class StatusTy { + Success, // Parsing Succeeded + Failure, // Parsing Failed after consuming some tokens + NoMatch, // Parsing Failed without consuming any tokens + } Status; public: #if __cplusplus >= 202002L @@ -152,19 +150,6 @@ class ParseStatus { constexpr bool isSuccess() const { return Status == StatusTy::Success; } constexpr bool isFailure() const { return Status == StatusTy::Failure; } constexpr bool isNoMatch() const { return Status == StatusTy::NoMatch; } - - // Allow implicit conversions to / from OperandMatchResultTy. - LLVM_DEPRECATED("Migrate to ParseStatus", "") - constexpr ParseStatus(OperandMatchResultTy R) - : Status(R == MatchOperand_Success ? Success - : R == MatchOperand_ParseFail ? Failure - : NoMatch) {} - LLVM_DEPRECATED("Migrate to ParseStatus", "") - constexpr operator OperandMatchResultTy() const { - return isSuccess() ? MatchOperand_Success - : isFailure() ? MatchOperand_ParseFail - : MatchOperand_NoMatch; - } }; enum class DiagnosticPredicateTy { diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 7f91dd7ebf49d..1458318ff021a 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -50,6 +50,7 @@ #include "llvm/CodeGen/MachineLICM.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/OptimizePHIs.h" #include "llvm/CodeGen/PHIElimination.h" @@ -960,7 +961,7 @@ Error CodeGenPassBuilder::addMachinePasses( if (getOptLevel() != CodeGenOptLevel::None && !TM.targetSchedulesPostRAScheduling()) { if (Opt.MISchedPostRA) - addPass(PostMachineSchedulerPass()); + addPass(PostMachineSchedulerPass(&TM)); else addPass(PostRASchedulerPass(&TM)); } @@ -1144,7 +1145,7 @@ void CodeGenPassBuilder::addOptimizedRegAlloc( addPass(RenameIndependentSubregsPass()); // PreRA instruction scheduling. 
- addPass(MachineSchedulerPass()); + addPass(MachineSchedulerPass(&TM)); if (derived().addRegAssignmentOptimized(addPass)) { // Allow targets to expand pseudo instructions depending on the choice of diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 9f9922dfa5673..075ebcb829553 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -142,12 +142,14 @@ MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass()) MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass()) MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass()) +MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass()) MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass()) MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass()) MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass()) MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass(TM)) +MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("print", PrintMIRPass()) MACHINE_FUNCTION_PASS("print", LiveDebugVariablesPrinterPass(errs())) MACHINE_FUNCTION_PASS("print", LiveIntervalsPrinterPass(errs())) @@ -243,13 +245,11 @@ DUMMY_MACHINE_FUNCTION_PASS("static-data-splitter", StaticDataSplitter) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata) -DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sink", MachineSinkingPass) DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity", MachineUniformityInfoWrapperPass) DUMMY_MACHINE_FUNCTION_PASS("machineinstr-printer", MachineFunctionPrinterPass) DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass) -DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass) DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass) DUMMY_MACHINE_FUNCTION_PASS("postrapseudos", ExpandPostRAPseudosPass) DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass) diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index 4e62ee9c00daf..2af73cb714a76 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -22,10 +22,10 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DroppedVariableStatsIR.h" #include "llvm/IR/OptBisect.h" #include "llvm/IR/PassTimingInfo.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Passes/DroppedVariableStatsIR.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Transforms/IPO/SampleProfileProbe.h" diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index a8ce9a8e6e69c..7f58f5236aedd 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -61,8 +61,10 @@ enum class IRMemLocation { ArgMem = 0, /// Memory that is 
inaccessible via LLVM IR. InaccessibleMem = 1, + /// Errno memory. + ErrnoMem = 2, /// Any other memory. - Other = 2, + Other = 3, /// Helpers to iterate all locations in the MemoryEffectsBase class. First = ArgMem, @@ -139,6 +141,16 @@ template class MemoryEffectsBase { return MemoryEffectsBase(Location::InaccessibleMem, MR); } + /// Create MemoryEffectsBase that can only access errno memory. + static MemoryEffectsBase errnoMemOnly(ModRefInfo MR = ModRefInfo::ModRef) { + return MemoryEffectsBase(Location::ErrnoMem, MR); + } + + /// Create MemoryEffectsBase that can only access other memory. + static MemoryEffectsBase otherMemOnly(ModRefInfo MR = ModRefInfo::ModRef) { + return MemoryEffectsBase(Location::Other, MR); + } + /// Create MemoryEffectsBase that can only access inaccessible or argument /// memory. static MemoryEffectsBase @@ -212,6 +224,11 @@ template class MemoryEffectsBase { return getWithoutLoc(Location::InaccessibleMem).doesNotAccessMemory(); } + /// Whether this function only (at most) accesses errno memory. + bool onlyAccessesErrnoMem() const { + return getWithoutLoc(Location::ErrnoMem).doesNotAccessMemory(); + } + /// Whether this function only (at most) accesses argument and inaccessible /// memory. bool onlyAccessesInaccessibleOrArgMem() const { @@ -309,6 +326,10 @@ inline bool capturesFullProvenance(CaptureComponents CC) { return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; } +inline bool capturesAll(CaptureComponents CC) { + return CC == CaptureComponents::All; +} + raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); /// Represents which components of the pointer may be captured in which @@ -333,6 +354,15 @@ class CaptureInfo { /// Create CaptureInfo that may capture all components of the pointer. static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } + /// Create CaptureInfo that may only capture via the return value. + static CaptureInfo + retOnly(CaptureComponents RetComponents = CaptureComponents::All) { + return CaptureInfo(CaptureComponents::None, RetComponents); + } + + /// Whether the pointer is only captured via the return value. + bool isRetOnly() const { return capturesNothing(OtherComponents); } + /// Get components potentially captured by the return value. CaptureComponents getRetComponents() const { return RetComponents; } diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h index d86630decd3ae..44cb5183fb31d 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -881,19 +881,21 @@ void SampleProfileLoaderBaseImpl::buildEdges(FunctionT &F) { // Add predecessors for B1. SmallPtrSet Visited; - if (!Predecessors[B1].empty()) + auto &Preds = Predecessors[B1]; + if (!Preds.empty()) llvm_unreachable("Found a stale predecessors list in a basic block."); for (auto *B2 : getPredecessors(B1)) if (Visited.insert(B2).second) - Predecessors[B1].push_back(B2); + Preds.push_back(B2); // Add successors for B1. 
Visited.clear(); - if (!Successors[B1].empty()) + auto &Succs = Successors[B1]; + if (!Succs.empty()) llvm_unreachable("Found a stale successors list in a basic block."); for (auto *B2 : getSuccessors(B1)) if (Visited.insert(B2).second) - Successors[B1].push_back(B2); + Succs.push_back(B2); } } diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h index c931319d3b002..9bdf940fc77b7 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h @@ -18,6 +18,7 @@ #include "llvm/SandboxIR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" #include namespace llvm::sandboxir { @@ -85,11 +86,13 @@ class InstrMaps { /// Update the map to reflect that \p Origs got vectorized into \p Vec. void registerVector(ArrayRef Origs, Value *Vec) { auto &OrigToLaneMap = VectorToOrigLaneMap[Vec]; - for (auto [Lane, Orig] : enumerate(Origs)) { + unsigned Lane = 0; + for (Value *Orig : Origs) { auto Pair = OrigToVectorMap.try_emplace(Orig, Vec); assert(Pair.second && "Orig already exists in the map!"); (void)Pair; OrigToLaneMap[Orig] = Lane; + Lane += VecUtils::getNumLanes(Orig); } } void clear() { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h index 0fa40e00d23fc..6c2315af0e797 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h @@ -30,8 +30,23 @@ namespace llvm::sandboxir { class PriorityCmp { public: bool operator()(const DGNode *N1, const DGNode *N2) { - // TODO: This should be a hierarchical comparator. - return N1->getInstruction()->comesBefore(N2->getInstruction()); + // Given that the DAG does not model dependencies such that PHIs are always + // at the top, or terminators always at the bottom, we need to force the + // priority here in the comparator of the ready list container. + auto *I1 = N1->getInstruction(); + auto *I2 = N2->getInstruction(); + bool IsTerm1 = I1->isTerminator(); + bool IsTerm2 = I2->isTerminator(); + if (IsTerm1 != IsTerm2) + // Terminators have the lowest priority. + return IsTerm1 > IsTerm2; + bool IsPHI1 = isa(I1); + bool IsPHI2 = isa(I2); + if (IsPHI1 != IsPHI2) + // PHIs have the highest priority. + return IsPHI1 < IsPHI2; + // Otherwise rely on the instruction order. + return I2->comesBefore(I1); } }; diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 49baf2eb84bb3..5120b910e7896 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -81,14 +81,15 @@ struct SimpleCaptureTracker : public CaptureTracker { Captured = true; } - bool captured(const Use *U) override { + Action captured(const Use *U, UseCaptureInfo CI) override { + // TODO(captures): Use UseCaptureInfo. 
if (isa(U->getUser()) && !ReturnCaptures) - return false; + return ContinueIgnoringReturn; LLVM_DEBUG(dbgs() << "Captured by: " << *U->getUser() << "\n"); Captured = true; - return true; + return Stop; } bool ReturnCaptures; @@ -122,19 +123,21 @@ struct CapturesBefore : public CaptureTracker { return !isPotentiallyReachable(I, BeforeHere, nullptr, DT, LI); } - bool captured(const Use *U) override { + Action captured(const Use *U, UseCaptureInfo CI) override { + // TODO(captures): Use UseCaptureInfo. Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return false; + return ContinueIgnoringReturn; // Check isSafeToPrune() here rather than in shouldExplore() to avoid // an expensive reachability query for every instruction we look at. // Instead we only do one for actual capturing candidates. if (isSafeToPrune(I)) - return false; + // If the use is not reachable, the instruction result isn't either. + return ContinueIgnoringReturn; Captured = true; - return true; + return Stop; } const Instruction *BeforeHere; @@ -166,10 +169,11 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = &*F.getEntryBlock().begin(); } - bool captured(const Use *U) override { + Action captured(const Use *U, UseCaptureInfo CI) override { + // TODO(captures): Use UseCaptureInfo. Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return false; + return ContinueIgnoringReturn; if (!EarliestCapture) EarliestCapture = I; @@ -177,9 +181,10 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = DT.findNearestCommonDominator(EarliestCapture, I); Captured = true; - // Return false to continue analysis; we need to see all potential - // captures. - return false; + // Continue analysis, as we need to see all potential captures. However, + // we do not need to follow the instruction result, as this use will + // dominate any captures made through the instruction result.. + return ContinueIgnoringReturn; } Instruction *EarliestCapture = nullptr; @@ -274,25 +279,26 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F, return CB.EarliestCapture; } -UseCaptureKind llvm::DetermineUseCaptureKind( - const Use &U, +UseCaptureInfo llvm::DetermineUseCaptureKind( + const Use &U, const Value *Base, function_ref IsDereferenceableOrNull) { Instruction *I = dyn_cast(U.getUser()); // TODO: Investigate non-instruction uses. if (!I) - return UseCaptureKind::MAY_CAPTURE; + return CaptureComponents::All; switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: { + // TODO(captures): Make this more precise. auto *Call = cast(I); // Not captured if the callee is readonly, doesn't return a copy through // its return value and doesn't unwind (a readonly function can leak bits // by throwing an exception or not depending on the input value). if (Call->onlyReadsMemory() && Call->doesNotThrow() && Call->getType()->isVoidTy()) - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::None; // The pointer is not captured if returned pointer is not captured. // NOTE: CaptureTracking users should not assume that only functions @@ -300,13 +306,13 @@ UseCaptureKind llvm::DetermineUseCaptureKind( // getUnderlyingObject in ValueTracking or DecomposeGEPExpression // in BasicAA also need to know about this property. 
if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true)) - return UseCaptureKind::PASSTHROUGH; + return UseCaptureInfo::passthrough(); // Volatile operations effectively capture the memory location that they // load and store to. if (auto *MI = dyn_cast(Call)) if (MI->isVolatile()) - return UseCaptureKind::MAY_CAPTURE; + return CaptureComponents::All; // Calling a function pointer does not in itself cause the pointer to // be captured. This is a subtle point considering that (for example) @@ -315,30 +321,27 @@ UseCaptureKind llvm::DetermineUseCaptureKind( // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). if (Call->isCallee(&U)) - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::None; // Not captured if only passed via 'nocapture' arguments. assert(Call->isDataOperand(&U) && "Non-callee must be data operand"); - if (!Call->doesNotCapture(Call->getDataOperandNo(&U))) { - // The parameter is not marked 'nocapture' - captured. - return UseCaptureKind::MAY_CAPTURE; - } - return UseCaptureKind::NO_CAPTURE; + CaptureInfo CI = Call->getCaptureInfo(Call->getDataOperandNo(&U)); + return UseCaptureInfo(CI.getOtherComponents(), CI.getRetComponents()); } case Instruction::Load: // Volatile loads make the address observable. if (cast(I)->isVolatile()) - return UseCaptureKind::MAY_CAPTURE; - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::All; + return CaptureComponents::None; case Instruction::VAArg: // "va-arg" from a pointer does not cause it to be captured. - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::None; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. // Volatile stores make the address observable. if (U.getOperandNo() == 0 || cast(I)->isVolatile()) - return UseCaptureKind::MAY_CAPTURE; - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::All; + return CaptureComponents::None; case Instruction::AtomicRMW: { // atomicrmw conceptually includes both a load and store from // the same location. @@ -347,8 +350,8 @@ UseCaptureKind llvm::DetermineUseCaptureKind( // Volatile stores make the address observable. auto *ARMWI = cast(I); if (U.getOperandNo() == 1 || ARMWI->isVolatile()) - return UseCaptureKind::MAY_CAPTURE; - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::All; + return CaptureComponents::None; } case Instruction::AtomicCmpXchg: { // cmpxchg conceptually includes both a load and store from @@ -358,31 +361,35 @@ UseCaptureKind llvm::DetermineUseCaptureKind( // Volatile stores make the address observable. auto *ACXI = cast(I); if (U.getOperandNo() == 1 || U.getOperandNo() == 2 || ACXI->isVolatile()) - return UseCaptureKind::MAY_CAPTURE; - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::All; + return CaptureComponents::None; } case Instruction::GetElementPtr: // AA does not support pointers of vectors, so GEP vector splats need to // be considered as captures. if (I->getType()->isVectorTy()) - return UseCaptureKind::MAY_CAPTURE; - return UseCaptureKind::PASSTHROUGH; + return CaptureComponents::All; + return UseCaptureInfo::passthrough(); case Instruction::BitCast: case Instruction::PHI: case Instruction::Select: case Instruction::AddrSpaceCast: // The original value is not captured via this if the new value isn't. 
- return UseCaptureKind::PASSTHROUGH; + return UseCaptureInfo::passthrough(); case Instruction::ICmp: { unsigned Idx = U.getOperandNo(); unsigned OtherIdx = 1 - Idx; - if (auto *CPN = dyn_cast(I->getOperand(OtherIdx))) { + if (isa(I->getOperand(OtherIdx)) && + cast(I)->isEquality()) { + // TODO(captures): Remove these special cases once we make use of + // captures(address_is_null). + // Don't count comparisons of a no-alias return value against null as // captures. This allows us to ignore comparisons of malloc results // with null, for example. - if (CPN->getType()->getAddressSpace() == 0) + if (U->getType()->getPointerAddressSpace() == 0) if (isNoAliasCall(U.get()->stripPointerCasts())) - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::None; if (!I->getFunction()->nullPointerIsDefined()) { auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); // Comparing a dereferenceable_or_null pointer against null cannot @@ -390,17 +397,23 @@ UseCaptureKind llvm::DetermineUseCaptureKind( // valid (in-bounds) pointer. const DataLayout &DL = I->getDataLayout(); if (IsDereferenceableOrNull && IsDereferenceableOrNull(O, DL)) - return UseCaptureKind::NO_CAPTURE; + return CaptureComponents::None; } + + // Check whether this is a comparison of the base pointer against + // null. + if (U.get() == Base) + return CaptureComponents::AddressIsNull; } // Otherwise, be conservative. There are crazy ways to capture pointers - // using comparisons. - return UseCaptureKind::MAY_CAPTURE; + // using comparisons. However, only the address is captured, not the + // provenance. + return CaptureComponents::Address; } default: // Something else - be conservative and say it is captured. - return UseCaptureKind::MAY_CAPTURE; + return CaptureComponents::All; } } @@ -438,18 +451,26 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, }; while (!Worklist.empty()) { const Use *U = Worklist.pop_back_val(); - switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) { - case UseCaptureKind::NO_CAPTURE: - continue; - case UseCaptureKind::MAY_CAPTURE: - if (Tracker->captured(U)) - return; - continue; - case UseCaptureKind::PASSTHROUGH: - if (!AddUses(U->getUser())) + UseCaptureInfo CI = DetermineUseCaptureKind(*U, V, IsDereferenceableOrNull); + if (capturesAnything(CI.UseCC)) { + switch (Tracker->captured(U, CI)) { + case CaptureTracker::Stop: return; - continue; + case CaptureTracker::ContinueIgnoringReturn: + continue; + case CaptureTracker::Continue: + // Fall through to passthrough handling, but only if ResultCC contains + // additional components that UseCC does not. We assume that a + // capture at this point will be strictly more constraining than a + // later capture from following the return value. + if (capturesNothing(CI.ResultCC & ~CI.UseCC)) + continue; + break; + } } + // TODO(captures): We could keep track of ResultCC for the users. + if (capturesAnything(CI.ResultCC) && !AddUses(U->getUser())) + return; } // All uses examined. 
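With this change every CaptureTracker implementation returns an Action from captured() and receives the UseCaptureInfo computed by DetermineUseCaptureKind, as the updated trackers above show. A minimal tracker written against the new interface might look like the following sketch; the class name is invented and the shape simply mirrors SimpleCaptureTracker.

    #include "llvm/Analysis/CaptureTracking.h"
    using namespace llvm;

    // Answers "may the pointer be captured at all?", stopping at the first capture.
    struct AnyCaptureTracker : public CaptureTracker {
      bool Captured = false;
      void tooManyUses() override { Captured = true; }
      Action captured(const Use *U, UseCaptureInfo CI) override {
        // The per-use precision in CI (UseCC/ResultCC) is deliberately ignored here.
        Captured = true;
        return Stop; // no need to visit further uses once one capture is seen
      }
    };

A caller hands such a tracker to PointerMayBeCaptured, whose reworked worklist loop above dispatches on Stop, Continue and ContinueIgnoringReturn.
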
diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp index a0774096c5129..0907a7fb021fc 100644 --- a/llvm/lib/Analysis/DDG.cpp +++ b/llvm/lib/Analysis/DDG.cpp @@ -241,9 +241,10 @@ bool DataDependenceGraph::addNode(DDGNode &N) { } const PiBlockDDGNode *DataDependenceGraph::getPiBlock(const NodeType &N) const { - if (!PiBlockMap.contains(&N)) + auto It = PiBlockMap.find(&N); + if (It == PiBlockMap.end()) return nullptr; - auto *Pi = PiBlockMap.find(&N)->second; + auto *Pi = It->second; assert(!PiBlockMap.contains(Pi) && "Nested pi-blocks detected."); return Pi; } diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 6ce2875beecca..cd252c62ba9cd 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -187,7 +187,7 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, if (DstI->mayReadOrWriteMemory()) { OS << "Src:" << *SrcI << " --> Dst:" << *DstI << "\n"; OS << " da analyze - "; - if (auto D = DA->depends(&*SrcI, &*DstI, true)) { + if (auto D = DA->depends(&*SrcI, &*DstI)) { // Normalize negative direction vectors if required by clients. if (NormalizeResults && D->normalize(&SE)) OS << "normalized - "; @@ -3589,8 +3589,8 @@ bool DependenceInfo::invalidate(Function &F, const PreservedAnalyses &PA, // Care is required to keep the routine below, getSplitIteration(), // up to date with respect to this routine. std::unique_ptr -DependenceInfo::depends(Instruction *Src, Instruction *Dst, - bool PossiblyLoopIndependent) { +DependenceInfo::depends(Instruction *Src, Instruction *Dst) { + bool PossiblyLoopIndependent = true; if (Src == Dst) PossiblyLoopIndependent = false; diff --git a/llvm/lib/Analysis/DependenceGraphBuilder.cpp b/llvm/lib/Analysis/DependenceGraphBuilder.cpp index c076e52ce6e14..5664e2f27a61a 100644 --- a/llvm/lib/Analysis/DependenceGraphBuilder.cpp +++ b/llvm/lib/Analysis/DependenceGraphBuilder.cpp @@ -295,7 +295,7 @@ void AbstractDependenceGraphBuilder::createMemoryDependencyEdges() { bool BackwardEdgeCreated = false; for (Instruction *ISrc : SrcIList) { for (Instruction *IDst : DstIList) { - auto D = DI.depends(ISrc, IDst, true); + auto D = DI.depends(ISrc, IDst); if (!D) continue; diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 59002cd934ab1..d25c1eecaf1ca 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2788,7 +2788,8 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, struct CustomCaptureTracker : public CaptureTracker { bool Captured = false; void tooManyUses() override { Captured = true; } - bool captured(const Use *U) override { + Action captured(const Use *U, UseCaptureInfo CI) override { + // TODO(captures): Use UseCaptureInfo. if (auto *ICmp = dyn_cast(U->getUser())) { // Comparison against value stored in global variable. 
Given the // pointer does not escape, its value cannot be guessed and stored @@ -2796,11 +2797,11 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, unsigned OtherIdx = 1 - U->getOperandNo(); auto *LI = dyn_cast(ICmp->getOperand(OtherIdx)); if (LI && isa(LI->getPointerOperand())) - return false; + return Continue; } Captured = true; - return true; + return Stop; } }; CustomCaptureTracker Tracker; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 3202ba81be78e..d24a48da8c589 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2999,20 +2999,12 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, : PSE(std::make_unique(*SE, *L)), PtrRtChecking(nullptr), TheLoop(L) { unsigned MaxTargetVectorWidthInBits = std::numeric_limits::max(); - if (TTI) { - TypeSize FixedWidth = - TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector); - if (FixedWidth.isNonZero()) { - // Scale the vector width by 2 as rough estimate to also consider - // interleaving. - MaxTargetVectorWidthInBits = FixedWidth.getFixedValue() * 2; - } + if (TTI && !TTI->enableScalableVectorization()) + // Scale the vector width by 2 as rough estimate to also consider + // interleaving. + MaxTargetVectorWidthInBits = + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; - TypeSize ScalableWidth = - TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector); - if (ScalableWidth.isNonZero()) - MaxTargetVectorWidthInBits = std::numeric_limits::max(); - } DepChecker = std::make_unique(*PSE, L, SymbolicStrides, MaxTargetVectorWidthInBits); PtrRtChecking = std::make_unique(*DepChecker, SE); diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp index 2897b922f61e4..050c32707596a 100644 --- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp +++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp @@ -224,7 +224,7 @@ IndexedReference::hasTemporalReuse(const IndexedReference &Other, } std::unique_ptr D = - DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst, true); + DI.depends(&StoreOrLoadInst, &Other.StoreOrLoadInst); if (D == nullptr) { LLVM_DEBUG(dbgs().indent(2) << "No temporal reuse: no dependence\n"); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index fb744d61aad63..2a49a10447e0b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -797,10 +797,28 @@ static void computeKnownBitsFromCond(const Value *V, Value *Cond, else Known2 = Known2.intersectWith(Known3); Known = Known.unionWith(Known2); + return; } - if (auto *Cmp = dyn_cast(Cond)) + if (auto *Cmp = dyn_cast(Cond)) { computeKnownBitsFromICmpCond(V, Cmp, Known, SQ, Invert); + return; + } + + if (match(Cond, m_Trunc(m_Specific(V)))) { + KnownBits DstKnown(1); + if (Invert) { + DstKnown.setAllZero(); + } else { + DstKnown.setAllOnes(); + } + if (cast(Cond)->hasNoUnsignedWrap()) { + Known = Known.unionWith(DstKnown.zext(Known.getBitWidth())); + return; + } + Known = Known.unionWith(DstKnown.anyext(Known.getBitWidth())); + return; + } if (Depth < MaxAnalysisRecursionDepth && match(Cond, m_Not(m_Value(A)))) computeKnownBitsFromCond(V, A, Known, Depth + 1, SQ, !Invert); @@ -3839,6 +3857,50 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, match(V2, m_PtrToIntSameSize(Q.DL, m_Value(B)))) return isKnownNonEqual(A, B, DemandedElts, Depth + 1, Q); + if (!Q.CxtI) + return false; + + // Try to 
infer NonEqual based on information from dominating conditions. + if (Q.DC && Q.DT) { + for (BranchInst *BI : Q.DC->conditionsFor(V1)) { + Value *Cond = BI->getCondition(); + BasicBlockEdge Edge0(BI->getParent(), BI->getSuccessor(0)); + if (Q.DT->dominates(Edge0, Q.CxtI->getParent()) && + isImpliedCondition(Cond, ICmpInst::ICMP_NE, V1, V2, Q.DL, + /*LHSIsTrue=*/true, Depth) + .value_or(false)) + return true; + + BasicBlockEdge Edge1(BI->getParent(), BI->getSuccessor(1)); + if (Q.DT->dominates(Edge1, Q.CxtI->getParent()) && + isImpliedCondition(Cond, ICmpInst::ICMP_NE, V1, V2, Q.DL, + /*LHSIsTrue=*/false, Depth) + .value_or(false)) + return true; + } + } + + if (!Q.AC) + return false; + + // Try to infer NonEqual based on information from assumptions. + for (auto &AssumeVH : Q.AC->assumptionsFor(V1)) { + if (!AssumeVH) + continue; + CallInst *I = cast(AssumeVH); + + assert(I->getFunction() == Q.CxtI->getFunction() && + "Got assumption for the wrong function!"); + assert(I->getIntrinsicID() == Intrinsic::assume && + "must be an assume intrinsic"); + + if (isImpliedCondition(I->getArgOperand(0), ICmpInst::ICMP_NE, V1, V2, Q.DL, + /*LHSIsTrue=*/true, Depth) + .value_or(false) && + isValidAssumeForContext(I, Q.CxtI, Q.DT)) + return true; + } + return false; } @@ -10213,10 +10275,10 @@ void llvm::findValuesAffectedByCondition( Worklist.push_back(B); } } else if (match(V, m_ICmp(Pred, m_Value(A), m_Value(B)))) { - AddCmpOperands(A, B); - bool HasRHSC = match(B, m_ConstantInt()); if (ICmpInst::isEquality(Pred)) { + AddAffected(A); + AddAffected(B); if (HasRHSC) { Value *Y; // (X & C) or (X | C). @@ -10230,6 +10292,7 @@ void llvm::findValuesAffectedByCondition( } } } else { + AddCmpOperands(A, B); if (HasRHSC) { // Handle (A + C1) u< C2, which is the canonical form of // A > C3 && A < C4. @@ -10280,6 +10343,10 @@ void llvm::findValuesAffectedByCondition( m_Value()))) { // Handle patterns that computeKnownFPClass() support. 
AddAffected(A); + } else if (!IsAssume && match(V, m_Trunc(m_Value(X)))) { + // Assume is checked here as X is already added above for assumes in + // addValueAffectedByCondition + AddAffected(X); } else if (!IsAssume && match(V, m_Not(m_Value(X)))) { // Assume is checked here to avoid issues with ephemeral values Worklist.push_back(X); diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 438824f84e2d0..c867a68518e4d 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -701,6 +701,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(readwrite); KEYWORD(argmem); KEYWORD(inaccessiblemem); + KEYWORD(errnomem); KEYWORD(argmemonly); KEYWORD(inaccessiblememonly); KEYWORD(inaccessiblemem_or_argmemonly); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index ad52a9f493eae..0817851bd408a 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2497,6 +2497,8 @@ static std::optional keywordToLoc(lltok::Kind Tok) { return IRMemLocation::ArgMem; case lltok::kw_inaccessiblemem: return IRMemLocation::InaccessibleMem; + case lltok::kw_errnomem: + return IRMemLocation::ErrnoMem; default: return std::nullopt; } @@ -2545,7 +2547,7 @@ std::optional LLParser::parseMemoryAttr() { std::optional MR = keywordToModRef(Lex.getKind()); if (!MR) { if (!Loc) - tokError("expected memory location (argmem, inaccessiblemem) " + tokError("expected memory location (argmem, inaccessiblemem, errnomem) " "or access kind (none, read, write, readwrite)"); else tokError("expected access kind (none, read, write, readwrite)"); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 1a09e80c4fbb2..d687495c42de6 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1937,8 +1937,7 @@ static void addRawAttributeValue(AttrBuilder &B, uint64_t Val) { } /// This fills an AttrBuilder object with the LLVM attributes that have -/// been decoded from the given integer. This function must stay in sync with -/// 'encodeLLVMAttributesForBitcode'. +/// been decoded from the given integer. static void decodeLLVMAttributesForBitcode(AttrBuilder &B, uint64_t EncodedAttrs, uint64_t AttrIdx) { @@ -2398,9 +2397,28 @@ Error BitcodeReader::parseAttributeGroupBlock() { B.addUWTableAttr(UWTableKind(Record[++i])); else if (Kind == Attribute::AllocKind) B.addAllocKindAttr(static_cast(Record[++i])); - else if (Kind == Attribute::Memory) - B.addMemoryAttr(MemoryEffects::createFromIntValue(Record[++i])); - else if (Kind == Attribute::Captures) + else if (Kind == Attribute::Memory) { + uint64_t EncodedME = Record[++i]; + const uint8_t Version = (EncodedME >> 56); + if (Version == 0) { + // Errno memory location was previously encompassed into default + // memory. Ensure this is taken into account while reconstructing + // the memory attribute prior to its introduction. + ModRefInfo ArgMem = ModRefInfo((EncodedME >> 0) & 3); + ModRefInfo InaccessibleMem = ModRefInfo((EncodedME >> 2) & 3); + ModRefInfo OtherMem = ModRefInfo((EncodedME >> 4) & 3); + auto ME = MemoryEffects::inaccessibleMemOnly(InaccessibleMem) | + MemoryEffects::argMemOnly(ArgMem) | + MemoryEffects::errnoMemOnly(OtherMem) | + MemoryEffects::otherMemOnly(OtherMem); + B.addMemoryAttr(ME); + } else { + // Construct the memory attribute directly from the encoded base + // on newer versions. 
+ B.addMemoryAttr(MemoryEffects::createFromIntValue( + EncodedME & 0x00FFFFFFFFFFFFFFULL)); + } + } else if (Kind == Attribute::Captures) B.addCapturesAttr(CaptureInfo::createFromIntValue(Record[++i])); else if (Kind == Attribute::NoFPClass) B.addNoFPClassAttr( diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7ca63c2c7251d..450b8066540e5 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -977,8 +977,15 @@ void ModuleBitcodeWriter::writeAttributeGroupTable() { Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); } else if (Attr.isIntAttribute()) { Record.push_back(1); - Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); - Record.push_back(Attr.getValueAsInt()); + Attribute::AttrKind Kind = Attr.getKindAsEnum(); + Record.push_back(getAttrKindEncoding(Kind)); + if (Kind == Attribute::Memory) { + // Version field for upgrading old memory effects. + const uint64_t Version = 1; + Record.push_back((Version << 56) | Attr.getValueAsInt()); + } else { + Record.push_back(Attr.getValueAsInt()); + } } else if (Attr.isStringAttribute()) { StringRef Kind = Attr.getKindAsString(); StringRef Val = Attr.getValueAsString(); diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp index 7986f7d213454..eaef14574385a 100644 --- a/llvm/lib/CodeGen/CFIFixup.cpp +++ b/llvm/lib/CodeGen/CFIFixup.cpp @@ -80,6 +80,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDwarf.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetMachine.h" #include @@ -252,6 +253,11 @@ fixupBlock(MachineBasicBlock &CurrBB, const BlockFlagsVector &BlockInfo, if (!Info.Reachable) return false; + // If we don't need to perform full CFI fix up, we only need to fix up the + // first basic block in the section. + if (!TFL.enableFullCFIFixup(MF) && !CurrBB.isBeginSection()) + return false; + // If the previous block and the current block are in the same section, // the frame info will propagate from the previous block to the current one. 
const BlockFlags &PrevInfo = diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 88f863d8204d0..23ec3310079d3 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -50,6 +50,7 @@ add_llvm_component_library(LLVMCodeGen DeadMachineInstructionElim.cpp DetectDeadLanes.cpp DFAPacketizer.cpp + DroppedVariableStatsMIR.cpp DwarfEHPrepare.cpp EarlyIfConversion.cpp EdgeBundles.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index d69a24f00871e..35df2a479a545 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -94,7 +94,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeModuloScheduleTestPass(Registry); initializeMachinePostDominatorTreeWrapperPassPass(Registry); initializeMachineRegionInfoPassPass(Registry); - initializeMachineSchedulerPass(Registry); + initializeMachineSchedulerLegacyPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineUniformityAnalysisPassPass(Registry); initializeMachineUniformityInfoPrinterPassPass(Registry); @@ -105,7 +105,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePHIEliminationPass(Registry); initializePatchableFunctionPass(Registry); initializePeepholeOptimizerLegacyPass(Registry); - initializePostMachineSchedulerPass(Registry); + initializePostMachineSchedulerLegacyPass(Registry); initializePostRAHazardRecognizerPass(Registry); initializePostRAMachineSinkingPass(Registry); initializePostRASchedulerLegacyPass(Registry); diff --git a/llvm/lib/CodeGen/DroppedVariableStatsMIR.cpp b/llvm/lib/CodeGen/DroppedVariableStatsMIR.cpp new file mode 100644 index 0000000000000..9a1d4fb5d888a --- /dev/null +++ b/llvm/lib/CodeGen/DroppedVariableStatsMIR.cpp @@ -0,0 +1,95 @@ +///===- DroppedVariableStatsMIR.cpp ---------------------------------------===// +/// +/// Part of the LLVM Project, under the Apache License v2.0 with LLVM +/// Exceptions. See https://llvm.org/LICENSE.txt for license information. +/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +/// +///===---------------------------------------------------------------------===// +/// \file +/// Dropped Variable Statistics for Debug Information. Reports any number +/// of DBG_VALUEs that get dropped due to an optimization pass. 
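+/// Statistics are gathered by recording the debug variables that are live
+/// before a MachineFunction pass runs and comparing them against the set
+/// that survives after the pass.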
+/// +///===---------------------------------------------------------------------===// + +#include "llvm/CodeGen/DroppedVariableStatsMIR.h" +#include "llvm/IR/DebugInfoMetadata.h" + +using namespace llvm; + +void DroppedVariableStatsMIR::runBeforePass(StringRef PassID, + MachineFunction *MF) { + if (PassID == "Debug Variable Analysis") + return; + setup(); + return runOnMachineFunction(MF, true); +} + +void DroppedVariableStatsMIR::runAfterPass(StringRef PassID, + MachineFunction *MF) { + if (PassID == "Debug Variable Analysis") + return; + runOnMachineFunction(MF, false); + calculateDroppedVarStatsOnMachineFunction(MF, PassID, MF->getName().str()); + cleanup(); +} + +void DroppedVariableStatsMIR::runOnMachineFunction(const MachineFunction *MF, + bool Before) { + auto &DebugVariables = DebugVariablesStack.back()[&MF->getFunction()]; + auto FuncName = MF->getName(); + MFunc = MF; + run(DebugVariables, FuncName, Before); +} + +void DroppedVariableStatsMIR::calculateDroppedVarStatsOnMachineFunction( + const MachineFunction *MF, StringRef PassID, StringRef FuncOrModName) { + MFunc = MF; + StringRef FuncName = MF->getName(); + const Function *Func = &MF->getFunction(); + DebugVariables &DbgVariables = DebugVariablesStack.back()[Func]; + calculateDroppedStatsAndPrint(DbgVariables, FuncName, PassID, FuncOrModName, + "MachineFunction", Func); +} + +void DroppedVariableStatsMIR::visitEveryInstruction( + unsigned &DroppedCount, DenseMap &InlinedAtsMap, + VarID Var) { + unsigned PrevDroppedCount = DroppedCount; + const DIScope *DbgValScope = std::get<0>(Var); + for (const auto &MBB : *MFunc) { + for (const auto &MI : MBB) { + if (!MI.isDebugInstr()) { + auto *DbgLoc = MI.getDebugLoc().get(); + if (!DbgLoc) + continue; + + auto *Scope = DbgLoc->getScope(); + if (updateDroppedCount(DbgLoc, Scope, DbgValScope, InlinedAtsMap, Var, + DroppedCount)) + break; + } + } + if (PrevDroppedCount != DroppedCount) { + PrevDroppedCount = DroppedCount; + break; + } + } +} + +void DroppedVariableStatsMIR::visitEveryDebugRecord( + DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) { + for (const auto &MBB : *MFunc) { + for (const auto &MI : MBB) { + if (MI.isDebugValueLike()) { + auto *DbgVar = MI.getDebugVariable(); + if (!DbgVar) + continue; + auto DbgLoc = MI.getDebugLoc(); + populateVarIDSetAndInlinedMap(DbgVar, DbgLoc, VarIDSet, InlinedAtsMap, + FuncName, Before); + } + } + } +} diff --git a/llvm/lib/CodeGen/MachineFunctionPass.cpp b/llvm/lib/CodeGen/MachineFunctionPass.cpp index 62ac3e32d24d9..a669877985821 100644 --- a/llvm/lib/CodeGen/MachineFunctionPass.cpp +++ b/llvm/lib/CodeGen/MachineFunctionPass.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/CodeGen/DroppedVariableStatsMIR.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" @@ -32,6 +33,11 @@ using namespace llvm; using namespace ore; +static cl::opt DroppedVarStatsMIR( + "dropped-variable-stats-mir", cl::Hidden, + cl::desc("Dump dropped debug variables stats for MIR passes"), + cl::init(false)); + Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O, const std::string &Banner) const { return createMachineFunctionPrinterPass(O, Banner); @@ -91,7 +97,16 @@ bool MachineFunctionPass::runOnFunction(Function &F) { MFProps.reset(ClearedProperties); - bool RV = runOnMachineFunction(MF); + 
bool RV; + if (DroppedVarStatsMIR) { + DroppedVariableStatsMIR DroppedVarStatsMF; + auto PassName = getPassName(); + DroppedVarStatsMF.runBeforePass(PassName, &MF); + RV = runOnMachineFunction(MF); + DroppedVarStatsMF.runAfterPass(PassName, &MF); + } else { + RV = runOnMachineFunction(MF); + } if (ShouldEmitSizeRemarks) { // We wanted size remarks. Check if there was a change to the number of diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 3f72e8486c06e..0da7535031a7d 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -214,69 +214,119 @@ MachineSchedContext::~MachineSchedContext() { delete RegClassInfo; } -namespace { - -/// Base class for a machine scheduler class that can run at any point. -class MachineSchedulerBase : public MachineSchedContext, - public MachineFunctionPass { -public: - MachineSchedulerBase(char &ID) : MachineFunctionPass(ID) {} +namespace llvm { +namespace impl_detail { +/// Base class for the machine scheduler classes. +class MachineSchedulerBase : public MachineSchedContext { protected: void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); }; -/// MachineScheduler runs after coalescing and before register allocation. -class MachineScheduler : public MachineSchedulerBase { -public: - MachineScheduler(); +/// Impl class for MachineScheduler. +class MachineSchedulerImpl : public MachineSchedulerBase { + // These are only for using MF.verify() + // remove when verify supports passing in all analyses + MachineFunctionPass *P = nullptr; + MachineFunctionAnalysisManager *MFAM = nullptr; - void getAnalysisUsage(AnalysisUsage &AU) const override; +public: + struct RequiredAnalyses { + MachineLoopInfo &MLI; + MachineDominatorTree &MDT; + AAResults &AA; + LiveIntervals &LIS; + }; - bool runOnMachineFunction(MachineFunction&) override; + MachineSchedulerImpl() {} + // Migration only + void setLegacyPass(MachineFunctionPass *P) { this->P = P; } + void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; } - static char ID; // Class identification, replacement for typeinfo + bool run(MachineFunction &MF, const TargetMachine &TM, + const RequiredAnalyses &Analyses); protected: ScheduleDAGInstrs *createMachineScheduler(); }; -/// PostMachineScheduler runs after shortly before code emission. -class PostMachineScheduler : public MachineSchedulerBase { +/// Impl class for PostMachineScheduler. +class PostMachineSchedulerImpl : public MachineSchedulerBase { + // These are only for using MF.verify() + // remove when verify supports passing in all analyses + MachineFunctionPass *P = nullptr; + MachineFunctionAnalysisManager *MFAM = nullptr; + public: - PostMachineScheduler(); + struct RequiredAnalyses { + MachineLoopInfo &MLI; + AAResults &AA; + }; + PostMachineSchedulerImpl() {} + // Migration only + void setLegacyPass(MachineFunctionPass *P) { this->P = P; } + void setMFAM(MachineFunctionAnalysisManager *MFAM) { this->MFAM = MFAM; } - void getAnalysisUsage(AnalysisUsage &AU) const override; + bool run(MachineFunction &Func, const TargetMachine &TM, + const RequiredAnalyses &Analyses); + +protected: + ScheduleDAGInstrs *createPostMachineScheduler(); +}; +} // namespace impl_detail +} // namespace llvm + +using impl_detail::MachineSchedulerBase; +using impl_detail::MachineSchedulerImpl; +using impl_detail::PostMachineSchedulerImpl; + +namespace { +/// MachineScheduler runs after coalescing and before register allocation. 
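+/// This legacy wrapper only collects the analyses the scheduler needs and
+/// forwards them to MachineSchedulerImpl::run(), which does the actual work.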
+class MachineSchedulerLegacy : public MachineFunctionPass { + MachineSchedulerImpl Impl; + +public: + MachineSchedulerLegacy(); + void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction&) override; static char ID; // Class identification, replacement for typeinfo +}; -protected: - ScheduleDAGInstrs *createPostMachineScheduler(); +/// PostMachineScheduler runs after shortly before code emission. +class PostMachineSchedulerLegacy : public MachineFunctionPass { + PostMachineSchedulerImpl Impl; + +public: + PostMachineSchedulerLegacy(); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &) override; + + static char ID; // Class identification, replacement for typeinfo }; } // end anonymous namespace -char MachineScheduler::ID = 0; +char MachineSchedulerLegacy::ID = 0; -char &llvm::MachineSchedulerID = MachineScheduler::ID; +char &llvm::MachineSchedulerID = MachineSchedulerLegacy::ID; -INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(MachineSchedulerLegacy, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, +INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) -MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) { - initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); +MachineSchedulerLegacy::MachineSchedulerLegacy() : MachineFunctionPass(ID) { + initializeMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry()); } -void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { +void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); @@ -289,23 +339,24 @@ void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -char PostMachineScheduler::ID = 0; +char PostMachineSchedulerLegacy::ID = 0; -char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID; +char &llvm::PostMachineSchedulerID = PostMachineSchedulerLegacy::ID; -INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched", +INITIALIZE_PASS_BEGIN(PostMachineSchedulerLegacy, "postmisched", "PostRA Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(PostMachineScheduler, "postmisched", +INITIALIZE_PASS_END(PostMachineSchedulerLegacy, "postmisched", "PostRA Machine Instruction Scheduler", false, false) -PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) { - initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry()); +PostMachineSchedulerLegacy::PostMachineSchedulerLegacy() + : MachineFunctionPass(ID) { + initializePostMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry()); } -void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { +void PostMachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); @@ -385,17 +436,14 @@ nextIfDebug(MachineBasicBlock::iterator I, } /// Instantiate a ScheduleDAGInstrs 
that will be owned by the caller. -ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() { +ScheduleDAGInstrs *MachineSchedulerImpl::createMachineScheduler() { // Select the scheduler, or set the default. MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt; if (Ctor != useDefaultMachineSched) return Ctor(this); - const TargetMachine &TM = - getAnalysis().getTM(); - // Get the default scheduler set by the target for this function. - ScheduleDAGInstrs *Scheduler = TM.createMachineScheduler(this); + ScheduleDAGInstrs *Scheduler = TM->createMachineScheduler(this); if (Scheduler) return Scheduler; @@ -403,14 +451,47 @@ ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() { return createGenericSchedLive(this); } +bool MachineSchedulerImpl::run(MachineFunction &Func, const TargetMachine &TM, + const RequiredAnalyses &Analyses) { + MF = &Func; + MLI = &Analyses.MLI; + MDT = &Analyses.MDT; + this->TM = &TM; + AA = &Analyses.AA; + LIS = &Analyses.LIS; + + if (VerifyScheduling) { + LLVM_DEBUG(LIS->dump()); + const char *MSchedBanner = "Before machine scheduling."; + if (P) + MF->verify(P, MSchedBanner, &errs()); + else + MF->verify(*MFAM, MSchedBanner, &errs()); + } + RegClassInfo->runOnMachineFunction(*MF); + + // Instantiate the selected scheduler for this target, function, and + // optimization level. + std::unique_ptr Scheduler(createMachineScheduler()); + scheduleRegions(*Scheduler, false); + + LLVM_DEBUG(LIS->dump()); + if (VerifyScheduling) { + const char *MSchedBanner = "After machine scheduling."; + if (P) + MF->verify(P, MSchedBanner, &errs()); + else + MF->verify(*MFAM, MSchedBanner, &errs()); + } + return true; +} + /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by /// the caller. We don't have a command line option to override the postRA /// scheduler. The Target must configure it. -ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { - const TargetMachine &TM = - getAnalysis().getTM(); +ScheduleDAGInstrs *PostMachineSchedulerImpl::createPostMachineScheduler() { // Get the postRA scheduler set by the target for this function. - ScheduleDAGInstrs *Scheduler = TM.createPostMachineScheduler(this); + ScheduleDAGInstrs *Scheduler = TM->createPostMachineScheduler(this); if (Scheduler) return Scheduler; @@ -418,6 +499,37 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { return createGenericSchedPostRA(this); } +bool PostMachineSchedulerImpl::run(MachineFunction &Func, + const TargetMachine &TM, + const RequiredAnalyses &Analyses) { + MF = &Func; + MLI = &Analyses.MLI; + this->TM = &TM; + AA = &Analyses.AA; + + if (VerifyScheduling) { + const char *PostMSchedBanner = "Before post machine scheduling."; + if (P) + MF->verify(P, PostMSchedBanner, &errs()); + else + MF->verify(*MFAM, PostMSchedBanner, &errs()); + } + + // Instantiate the selected scheduler for this target, function, and + // optimization level. + std::unique_ptr Scheduler(createPostMachineScheduler()); + scheduleRegions(*Scheduler, true); + + if (VerifyScheduling) { + const char *PostMSchedBanner = "After post machine scheduling."; + if (P) + MF->verify(P, PostMSchedBanner, &errs()); + else + MF->verify(*MFAM, PostMSchedBanner, &errs()); + } + return true; +} + /// Top-level MachineScheduler pass driver. /// /// Visit blocks in function order. 
Divide each block into scheduling regions @@ -434,72 +546,112 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler /// design would be to split blocks at scheduling boundaries, but LLVM has a /// general bias against block splitting purely for implementation simplicity. -bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { - if (skipFunction(mf.getFunction())) +bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) return false; if (EnableMachineSched.getNumOccurrences()) { if (!EnableMachineSched) return false; - } else if (!mf.getSubtarget().enableMachineScheduler()) + } else if (!MF.getSubtarget().enableMachineScheduler()) { return false; - - LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs())); - - // Initialize the context of the pass. - MF = &mf; - MLI = &getAnalysis().getLI(); - MDT = &getAnalysis().getDomTree(); - AA = &getAnalysis().getAAResults(); - - LIS = &getAnalysis().getLIS(); - - if (VerifyScheduling) { - LLVM_DEBUG(LIS->dump()); - MF->verify(this, "Before machine scheduling.", &errs()); } - RegClassInfo->runOnMachineFunction(*MF); - // Instantiate the selected scheduler for this target, function, and - // optimization level. - std::unique_ptr Scheduler(createMachineScheduler()); - scheduleRegions(*Scheduler, false); + LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs())); - LLVM_DEBUG(LIS->dump()); - if (VerifyScheduling) - MF->verify(this, "After machine scheduling.", &errs()); - return true; + auto &MLI = getAnalysis().getLI(); + auto &MDT = getAnalysis().getDomTree(); + auto &TM = getAnalysis().getTM(); + auto &AA = getAnalysis().getAAResults(); + auto &LIS = getAnalysis().getLIS(); + Impl.setLegacyPass(this); + return Impl.run(MF, TM, {MLI, MDT, AA, LIS}); +} + +MachineSchedulerPass::MachineSchedulerPass(const TargetMachine *TM) + : Impl(std::make_unique()), TM(TM) {} +MachineSchedulerPass::~MachineSchedulerPass() = default; +MachineSchedulerPass::MachineSchedulerPass(MachineSchedulerPass &&Other) = + default; + +PostMachineSchedulerPass::PostMachineSchedulerPass(const TargetMachine *TM) + : Impl(std::make_unique()), TM(TM) {} +PostMachineSchedulerPass::PostMachineSchedulerPass( + PostMachineSchedulerPass &&Other) = default; +PostMachineSchedulerPass::~PostMachineSchedulerPass() = default; + +PreservedAnalyses +MachineSchedulerPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (EnableMachineSched.getNumOccurrences()) { + if (!EnableMachineSched) + return PreservedAnalyses::all(); + } else if (!MF.getSubtarget().enableMachineScheduler()) { + return PreservedAnalyses::all(); + } + + LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs())); + auto &MLI = MFAM.getResult(MF); + auto &MDT = MFAM.getResult(MF); + auto &FAM = MFAM.getResult(MF) + .getManager(); + auto &AA = FAM.getResult(MF.getFunction()); + auto &LIS = MFAM.getResult(MF); + Impl->setMFAM(&MFAM); + bool Changed = Impl->run(MF, *TM, {MLI, MDT, AA, LIS}); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet(); + PA.preserve(); + PA.preserve(); + return PA; } -bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { - if (skipFunction(mf.getFunction())) +bool PostMachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) return false; if 
(EnablePostRAMachineSched.getNumOccurrences()) { if (!EnablePostRAMachineSched) return false; - } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) { + } else if (!MF.getSubtarget().enablePostRAMachineScheduler()) { LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); return false; } - LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs())); - - // Initialize the context of the pass. - MF = &mf; - MLI = &getAnalysis().getLI(); - AA = &getAnalysis().getAAResults(); - - if (VerifyScheduling) - MF->verify(this, "Before post machine scheduling.", &errs()); - - // Instantiate the selected scheduler for this target, function, and - // optimization level. - std::unique_ptr Scheduler(createPostMachineScheduler()); - scheduleRegions(*Scheduler, true); + LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; MF.print(dbgs())); + auto &MLI = getAnalysis().getLI(); + auto &TM = getAnalysis().getTM(); + auto &AA = getAnalysis().getAAResults(); + Impl.setLegacyPass(this); + return Impl.run(MF, TM, {MLI, AA}); +} - if (VerifyScheduling) - MF->verify(this, "After post machine scheduling.", &errs()); - return true; +PreservedAnalyses +PostMachineSchedulerPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (EnablePostRAMachineSched.getNumOccurrences()) { + if (!EnablePostRAMachineSched) + return PreservedAnalyses::all(); + } else if (!MF.getSubtarget().enablePostRAMachineScheduler()) { + LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); + return PreservedAnalyses::all(); + } + LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; MF.print(dbgs())); + auto &MLI = MFAM.getResult(MF); + auto &FAM = MFAM.getResult(MF) + .getManager(); + auto &AA = FAM.getResult(MF.getFunction()); + + Impl->setMFAM(&MFAM); + bool Changed = Impl->run(MF, *TM, {MLI, AA}); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet(); + return PA; } /// Return true of the given instruction should not be included in a scheduling @@ -3097,6 +3249,7 @@ void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA, #ifndef NDEBUG const char *GenericSchedulerBase::getReasonStr( GenericSchedulerBase::CandReason Reason) { + // clang-format off switch (Reason) { case NoCand: return "NOCAND "; case Only1: return "ONLY1 "; @@ -3113,9 +3266,9 @@ const char *GenericSchedulerBase::getReasonStr( case TopPathReduce: return "TOP-PATH "; case BotHeightReduce:return "BOT-HEIGHT"; case BotPathReduce: return "BOT-PATH "; - case NextDefUse: return "DEF-USE "; case NodeOrder: return "ORDER "; }; + // clang-format on llvm_unreachable("Unknown reason!"); } diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index e1f05406297d2..51e047b2fa3f0 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -135,7 +135,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescerLegacy) -INITIALIZE_PASS_DEPENDENCY(MachineScheduler) +INITIALIZE_PASS_DEPENDENCY(MachineSchedulerLegacy) INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 465c4e8feffbb..2e43ad78e5d9b 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ 
b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -155,7 +155,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescerLegacy) -INITIALIZE_PASS_DEPENDENCY(MachineScheduler) +INITIALIZE_PASS_DEPENDENCY(MachineSchedulerLegacy) INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ef58da873c59c..c6fd72b6b76f4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9691,7 +9691,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { } // fold (not (add X, -1)) -> (neg X) - if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD && + if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && isAllOnesConstant(N1) && isAllOnesOrAllOnesSplat(N0.getOperand(1))) { return DAG.getNegative(N0.getOperand(0), DL, VT); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index f13f70e66cfaa..b58c160b5c8b8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -880,6 +880,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_BITCAST(SDNode *N); SDValue ScalarizeVecOp_UnaryOp(SDNode *N); + SDValue ScalarizeVecOp_UnaryOpWithExtraInput(SDNode *N); SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N); SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N); SDValue ScalarizeVecOp_INSERT_SUBVECTOR(SDNode *N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 915ee2d110332..1d8bf5427156e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -775,6 +775,10 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::LLRINT: Res = ScalarizeVecOp_UnaryOp(N); break; + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: + Res = ScalarizeVecOp_UnaryOpWithExtraInput(N); + break; case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_FP_TO_SINT: @@ -882,6 +886,20 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op); } +/// Same as ScalarizeVecOp_UnaryOp with an extra operand (for example a +/// typesize). +SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOpWithExtraInput(SDNode *N) { + assert(N->getValueType(0).getVectorNumElements() == 1 && + "Unexpected vector type!"); + SDValue Elt = GetScalarizedVector(N->getOperand(0)); + SDValue Op = + DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0).getScalarType(), + Elt, N->getOperand(1)); + // Revectorize the result so the types line up with what the uses of this + // expression expect. + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op); +} + /// If the input is a vector that needs to be scalarized, it must be <1 x ty>. /// Do the strict FP operation on the element instead. 
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 16c3b295426c6..9d2f87497d6fa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8015,17 +8015,8 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG, if (Slice.Array == nullptr) { if (VT.isInteger()) return DAG.getConstant(0, dl, VT); - if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) - return DAG.getConstantFP(0.0, dl, VT); - if (VT.isVector()) { - unsigned NumElts = VT.getVectorNumElements(); - MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getConstant(0, dl, - EVT::getVectorVT(*DAG.getContext(), - EltVT, NumElts))); - } - llvm_unreachable("Expected type!"); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getConstant(0, dl, VT.changeTypeToInteger())); } assert(!VT.isVector() && "Can't handle vector type here!"); @@ -13415,10 +13406,10 @@ BuildVectorSDNode::isConstantSequence() const { return std::make_pair(Start, Stride); } -bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) { +bool ShuffleVectorSDNode::isSplatMask(ArrayRef Mask) { // Find the first non-undef value in the shuffle mask. unsigned i, e; - for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i) + for (i = 0, e = Mask.size(); i != e && Mask[i] < 0; ++i) /* search */; // If all elements are undefined, this shuffle can be considered a splat diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 5a5596a542f72..cac25fd7c1025 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1855,7 +1855,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (isa(CDS->getType())) return DAG.getMergeValues(Ops, getCurSDLoc()); - return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); + return DAG.getBuildVector(VT, getCurSDLoc(), Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { @@ -1898,14 +1898,13 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (VT.isRISCVVectorTuple()) { assert(C->isNullValue() && "Can only zero this target type!"); - return NodeMap[V] = DAG.getNode( - ISD::BITCAST, getCurSDLoc(), VT, - DAG.getNode( - ISD::SPLAT_VECTOR, getCurSDLoc(), - EVT::getVectorVT(*DAG.getContext(), MVT::i8, - VT.getSizeInBits().getKnownMinValue() / 8, - true), - DAG.getConstant(0, getCurSDLoc(), MVT::getIntegerVT(8)))); + return DAG.getNode( + ISD::BITCAST, getCurSDLoc(), VT, + DAG.getNode( + ISD::SPLAT_VECTOR, getCurSDLoc(), + EVT::getVectorVT(*DAG.getContext(), MVT::i8, + VT.getSizeInBits().getKnownMinValue() / 8, true), + DAG.getConstant(0, getCurSDLoc(), MVT::getIntegerVT(8)))); } VectorType *VecTy = cast(V->getType()); @@ -1918,7 +1917,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { for (unsigned i = 0; i != NumElements; ++i) Ops.push_back(getValue(CV->getOperand(i))); - return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); + return DAG.getBuildVector(VT, getCurSDLoc(), Ops); } if (isa(C)) { @@ -1931,7 +1930,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { else Op = DAG.getConstant(0, getCurSDLoc(), EltVT); - return NodeMap[V] = DAG.getSplat(VT, getCurSDLoc(), Op); + return DAG.getSplat(VT, getCurSDLoc(), Op); } 
llvm_unreachable("Unknown vector constant"); diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index 08e86c705786c..5784974cd8ed9 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -35,7 +35,7 @@ bool TargetFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const return false; } -bool TargetFrameLowering::enableCFIFixup(MachineFunction &MF) const { +bool TargetFrameLowering::enableCFIFixup(const MachineFunction &MF) const { return MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); } diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 6cbc4b9776a1b..9f44f8b1c0f56 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -758,7 +758,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, if (!SupportsUnique) { Flags &= ~ELF::SHF_MERGE; EntrySize = 0; - return MCContext::GenericSectionID; + return MCSection::NonUniqueID; } const bool SymbolMergeable = Flags & ELF::SHF_MERGE; @@ -770,7 +770,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, if (TM.getSeparateNamedSections()) return NextUniqueID++; else - return MCContext::GenericSectionID; + return MCSection::NonUniqueID; } // Symbols must be placed into sections with compatible entry sizes. Generate @@ -778,8 +778,8 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, // sections. const auto PreviousID = Ctx.getELFUniqueIDForEntsize(SectionName, Flags, EntrySize); - if (PreviousID && (!TM.getSeparateNamedSections() || - *PreviousID == MCContext::GenericSectionID)) + if (PreviousID && + (!TM.getSeparateNamedSections() || *PreviousID == MCSection::NonUniqueID)) return *PreviousID; // If the user has specified the same section name as would be created @@ -791,7 +791,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, if (SymbolMergeable && Ctx.isELFImplicitMergeableSectionNamePrefix(SectionName) && SectionName.starts_with(ImplicitSectionNameStem)) - return MCContext::GenericSectionID; + return MCSection::NonUniqueID; // We have seen this section name before, but with different flags or entity // size. Create a new unique ID. @@ -903,7 +903,7 @@ static MCSectionELF *selectELFSectionForGlobal( unsigned EntrySize = getEntrySizeForKind(Kind); bool UniqueSectionName = false; - unsigned UniqueID = MCContext::GenericSectionID; + unsigned UniqueID = MCSection::NonUniqueID; if (EmitUniqueSection) { if (TM.getUniqueSectionNames()) { UniqueSectionName = true; @@ -1073,7 +1073,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForMachineBasicBlock( const Function &F, const MachineBasicBlock &MBB, const TargetMachine &TM) const { assert(MBB.isBeginSection() && "Basic block does not start a section!"); - unsigned UniqueID = MCContext::GenericSectionID; + unsigned UniqueID = MCSection::NonUniqueID; // For cold sections use the .text.split. prefix along with the parent // function name. 
All cold blocks for the same function go to the same @@ -1774,7 +1774,7 @@ MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal( else ComdatGV = GO; - unsigned UniqueID = MCContext::GenericSectionID; + unsigned UniqueID = MCSection::NonUniqueID; if (EmitUniquedSection) UniqueID = NextUniqueID++; @@ -2220,8 +2220,8 @@ MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal( } unsigned Flags = getWasmSectionFlags(Kind, Used.count(GO)); - MCSectionWasm *Section = getContext().getWasmSection( - Name, Kind, Flags, Group, MCContext::GenericSectionID); + MCSectionWasm *Section = getContext().getWasmSection(Name, Kind, Flags, Group, + MCSection::NonUniqueID); return Section; } @@ -2249,7 +2249,7 @@ selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO, Name.push_back('.'); TM.getNameWithPrefix(Name, GO, Mang, true); } - unsigned UniqueID = MCContext::GenericSectionID; + unsigned UniqueID = MCSection::NonUniqueID; if (EmitUniqueSection && !UniqueSectionNames) { UniqueID = *NextUniqueID; (*NextUniqueID)++; diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp index 7b38150ab4b65..45cb28af56050 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutorProcessControl.cpp @@ -54,9 +54,8 @@ SelfExecutorProcessControl::SelfExecutorProcessControl( // FIXME: Don't add an UnwindInfoManager by default -- it's redundant when // the ORC runtime is loaded. We'll need a way to document this and // allow clients to choose. - this->UnwindInfoMgr = UnwindInfoManager::TryCreate(); - if (this->UnwindInfoMgr) - this->UnwindInfoMgr->addBootstrapSymbols(this->BootstrapSymbols); + if (UnwindInfoManager::TryEnable()) + UnwindInfoManager::addBootstrapSymbols(this->BootstrapSymbols); #endif // __APPLE__ } diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index dd844ae3a42bc..972c24abc7506 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -1241,8 +1241,8 @@ Expected setUpGenericLLVMIRPlatform(LLJIT &J) { // If UseEHFrames hasn't been set then we're good to use compact-unwind. 
if (!UseEHFrames) { - if (auto UIRP = UnwindInfoRegistrationPlugin::Create( - J.getIRCompileLayer(), PlatformJD)) { + if (auto UIRP = + UnwindInfoRegistrationPlugin::Create(J.getExecutionSession())) { OLL->addPlugin(std::move(*UIRP)); LLVM_DEBUG(dbgs() << "Enabled compact-unwind support.\n"); } else diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp index fef3ff989a52a..d3b3f121cfcd9 100644 --- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp @@ -65,14 +65,6 @@ const char *RunAsIntFunctionWrapperName = } // end namespace rt namespace rt_alt { -const char *UnwindInfoManagerInstanceName = - "orc_rt_alt_UnwindInfoManager_Instance"; -const char *UnwindInfoManagerFindSectionsHelperName = - "orc_rt_alt_UnwindInfoManager_findSectionsHelper"; -const char *UnwindInfoManagerEnableWrapperName = - "orc_rt_alt_UnwindInfoManager_enable"; -const char *UnwindInfoManagerDisableWrapperName = - "orc_rt_alt_UnwindInfoManager_disable"; const char *UnwindInfoManagerRegisterActionName = "orc_rt_alt_UnwindInfoManager_register"; const char *UnwindInfoManagerDeregisterActionName = diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp index 9f748154c03e5..7510079f87b84 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.cpp @@ -9,7 +9,10 @@ #include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" -#include "llvm/Support/DynamicLibrary.h" + +#ifdef __APPLE__ +#include +#endif // __APPLE__ #define DEBUG_TYPE "orc" @@ -17,40 +20,17 @@ using namespace llvm; using namespace llvm::orc; using namespace llvm::orc::shared; -static orc::shared::CWrapperFunctionResult -llvm_orc_rt_alt_UnwindInfoManager_enable(const char *Data, uint64_t Size) { - return WrapperFunction::handle( - Data, Size, - [](ExecutorAddr Instance, ExecutorAddr FindFn) { - return Instance.toPtr()->enable( - FindFn.toPtr()); - }) - .release(); -} - -static orc::shared::CWrapperFunctionResult -llvm_orc_rt_alt_UnwindInfoManager_disable(const char *Data, uint64_t Size) { - return WrapperFunction::handle( - Data, Size, - [](ExecutorAddr Instance) { - return Instance.toPtr()->disable(); - }) - .release(); -} - static orc::shared::CWrapperFunctionResult llvm_orc_rt_alt_UnwindInfoManager_register(const char *Data, uint64_t Size) { - using SPSSig = - SPSError(SPSExecutorAddr, SPSSequence, - SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange); + using SPSSig = SPSError(SPSSequence, SPSExecutorAddr, + SPSExecutorAddrRange, SPSExecutorAddrRange); return WrapperFunction::handle( Data, Size, - [](ExecutorAddr Instance, - std::vector CodeRanges, ExecutorAddr DSOBase, + [](std::vector CodeRanges, ExecutorAddr DSOBase, ExecutorAddrRange DWARFRange, ExecutorAddrRange CompactUnwindRange) { - return Instance.toPtr()->registerSections( + return UnwindInfoManager::registerSections( CodeRanges, DSOBase, DWARFRange, CompactUnwindRange); }) .release(); @@ -58,89 +38,105 @@ llvm_orc_rt_alt_UnwindInfoManager_register(const char *Data, uint64_t Size) { static orc::shared::CWrapperFunctionResult llvm_orc_rt_alt_UnwindInfoManager_deregister(const char *Data, uint64_t Size) { - using SPSSig = SPSError(SPSExecutorAddr, SPSSequence); + 
using SPSSig = SPSError(SPSSequence); return WrapperFunction::handle( Data, Size, - [](ExecutorAddr Instance, - std::vector CodeRanges) { - return Instance.toPtr()->deregisterSections( - CodeRanges); + [](std::vector CodeRanges) { + return UnwindInfoManager::deregisterSections(CodeRanges); }) .release(); } namespace llvm::orc { -const char *UnwindInfoManager::AddFnName = +[[maybe_unused]] static const char *AddFnName = "__unw_add_find_dynamic_unwind_sections"; -const char *UnwindInfoManager::RemoveFnName = +[[maybe_unused]] static const char *RemoveFnName = "__unw_remove_find_dynamic_unwind_sections"; +static std::unique_ptr Instance; +static int (*RemoveFindDynamicUnwindSections)(void *) = nullptr; + +UnwindInfoManager::~UnwindInfoManager() { + if (int Err = RemoveFindDynamicUnwindSections((void *)&findSections)) { + (void)Err; // Silence unused variable warning in release builds. + LLVM_DEBUG({ + dbgs() << "Failed call to " << RemoveFnName << ": error = " << Err + << "\n"; + }); + (void)Err; + } +} -std::unique_ptr UnwindInfoManager::TryCreate() { - std::string ErrMsg; - auto DL = sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg); - if (!DL.isValid()) - return nullptr; +bool UnwindInfoManager::TryEnable() { +#ifdef __APPLE__ + static std::mutex M; + std::lock_guard Lock(M); - auto AddFindDynamicUnwindSections = - (int (*)(void *))DL.getAddressOfSymbol(AddFnName); - if (!AddFindDynamicUnwindSections) - return nullptr; + if (Instance) + return true; - auto RemoveFindDynamicUnwindSections = - (int (*)(void *))DL.getAddressOfSymbol(RemoveFnName); - if (!RemoveFindDynamicUnwindSections) - return nullptr; + auto AddFn = (int (*)(void *))dlsym(RTLD_DEFAULT, AddFnName); + if (!AddFn) + return false; - return std::unique_ptr(new UnwindInfoManager( - AddFindDynamicUnwindSections, RemoveFindDynamicUnwindSections)); -} + auto RemoveFn = (int (*)(void *))dlsym(RTLD_DEFAULT, RemoveFnName); + if (!RemoveFn) + return false; -Error UnwindInfoManager::shutdown() { return Error::success(); } + Instance.reset(new UnwindInfoManager()); + + if (auto Err = AddFn((void *)&findSections)) { + (void)Err; // Silence unused variable warning in release builds. 
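+    // Registration with libunwind failed: report it in debug builds, drop the
+    // instance, and leave unwind-info support disabled.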
+ LLVM_DEBUG({ + dbgs() << "Failed call to " << AddFnName << ": error = " << Err << "\n"; + }); + Instance = nullptr; + return false; + } + + RemoveFindDynamicUnwindSections = RemoveFn; + return true; + +#else + return false; +#endif // __APPLE__ +} void UnwindInfoManager::addBootstrapSymbols(StringMap &M) { - M[rt_alt::UnwindInfoManagerInstanceName] = ExecutorAddr::fromPtr(this); - M[rt_alt::UnwindInfoManagerFindSectionsHelperName] = - ExecutorAddr::fromPtr(&findSectionsHelper); - M[rt_alt::UnwindInfoManagerEnableWrapperName] = - ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_enable); - M[rt_alt::UnwindInfoManagerDisableWrapperName] = - ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_disable); M[rt_alt::UnwindInfoManagerRegisterActionName] = ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_register); M[rt_alt::UnwindInfoManagerDeregisterActionName] = ExecutorAddr::fromPtr(llvm_orc_rt_alt_UnwindInfoManager_deregister); } -Error UnwindInfoManager::enable(void *FindDynamicUnwindSections) { - LLVM_DEBUG(dbgs() << "Enabling UnwindInfoManager.\n"); - - if (auto Err = AddFindDynamicUnwindSections(FindDynamicUnwindSections)) - return make_error(Twine("Could not register function via ") + - AddFnName + - ", error code = " + Twine(Err), - inconvertibleErrorCode()); - - this->FindDynamicUnwindSections = FindDynamicUnwindSections; - return Error::success(); +Error UnwindInfoManager::registerSections( + ArrayRef CodeRanges, orc::ExecutorAddr DSOBase, + orc::ExecutorAddrRange DWARFEHFrame, orc::ExecutorAddrRange CompactUnwind) { + return Instance->registerSectionsImpl(CodeRanges, DSOBase, DWARFEHFrame, + CompactUnwind); } -Error UnwindInfoManager::disable(void) { - LLVM_DEBUG(dbgs() << "Disabling UnwindInfoManager.\n"); +Error UnwindInfoManager::deregisterSections( + ArrayRef CodeRanges) { + return Instance->deregisterSectionsImpl(CodeRanges); +} - if (FindDynamicUnwindSections) - if (auto Err = RemoveFindDynamicUnwindSections(FindDynamicUnwindSections)) - return make_error( - Twine("Could not deregister function via ") + RemoveFnName + - "error code = " + Twine(Err), - inconvertibleErrorCode()); +int UnwindInfoManager::findSectionsImpl(uintptr_t Addr, UnwindSections *Info) { + std::lock_guard Lock(M); + auto I = UWSecs.upper_bound(Addr); + if (I == UWSecs.begin()) + return 0; + --I; + *Info = I->second; + return 1; +} - FindDynamicUnwindSections = nullptr; - return Error::success(); +int UnwindInfoManager::findSections(uintptr_t Addr, UnwindSections *Info) { + return Instance->findSectionsImpl(Addr, Info); } -Error UnwindInfoManager::registerSections( +Error UnwindInfoManager::registerSectionsImpl( ArrayRef CodeRanges, ExecutorAddr DSOBase, ExecutorAddrRange DWARFEHFrame, ExecutorAddrRange CompactUnwind) { std::lock_guard Lock(M); @@ -154,7 +150,7 @@ Error UnwindInfoManager::registerSections( return Error::success(); } -Error UnwindInfoManager::deregisterSections( +Error UnwindInfoManager::deregisterSectionsImpl( ArrayRef CodeRanges) { std::lock_guard Lock(M); for (auto &R : CodeRanges) { @@ -169,20 +165,4 @@ Error UnwindInfoManager::deregisterSections( return Error::success(); } -int UnwindInfoManager::findSections(uintptr_t Addr, UnwindSections *Info) { - std::lock_guard Lock(M); - auto I = UWSecs.upper_bound(Addr); - if (I == UWSecs.begin()) - return 0; - --I; - *Info = I->second; - return 1; -} - -int UnwindInfoManager::findSectionsHelper(UnwindInfoManager *Instance, - uintptr_t Addr, - UnwindSections *Info) { - return Instance->findSections(Addr, Info); -} - } // namespace 
llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp index ae1f3f98269db..4482eedc00702 100644 --- a/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.cpp @@ -9,7 +9,6 @@ #include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h" #include "llvm/ADT/ScopeExit.h" -#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/Shared/MachOObjectFormat.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" #include "llvm/IR/IRBuilder.h" @@ -19,95 +18,21 @@ using namespace llvm::jitlink; -static const char *FindDynamicUnwindSectionsFunctionName = - "_orc_rt_alt_find_dynamic_unwind_sections"; - namespace llvm::orc { Expected> -UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD, - ExecutorAddr Instance, - ExecutorAddr FindHelper, - ExecutorAddr Enable, ExecutorAddr Disable, - ExecutorAddr Register, - ExecutorAddr Deregister) { - - auto &ES = IRL.getExecutionSession(); - - // Build bouncer module. - auto M = makeBouncerModule(ES); - if (!M) - return M.takeError(); - - auto BouncerRT = PlatformJD.createResourceTracker(); - auto RemoveBouncerModule = make_scope_exit([&]() { - if (auto Err = BouncerRT->remove()) - ES.reportError(std::move(Err)); - }); - - if (auto Err = PlatformJD.define(absoluteSymbols( - {{ES.intern(rt_alt::UnwindInfoManagerInstanceName), - ExecutorSymbolDef(Instance, JITSymbolFlags())}, - {ES.intern(rt_alt::UnwindInfoManagerFindSectionsHelperName), - ExecutorSymbolDef(FindHelper, JITSymbolFlags::Callable)}}))) - return std::move(Err); - - if (auto Err = IRL.add(BouncerRT, std::move(*M))) - return Err; - - auto FindUnwindSections = - ES.lookup({&PlatformJD}, FindDynamicUnwindSectionsFunctionName); - if (!FindUnwindSections) - return FindUnwindSections.takeError(); - - using namespace shared; - using SPSEnableSig = SPSError(SPSExecutorAddr, SPSExecutorAddr); - Error CallErr = Error::success(); - if (auto Err = ES.callSPSWrapper( - Enable, CallErr, Instance, FindUnwindSections->getAddress())) { - consumeError(std::move(CallErr)); - return std::move(Err); - } - - if (CallErr) - return std::move(CallErr); - - RemoveBouncerModule.release(); - - return std::shared_ptr( - new UnwindInfoRegistrationPlugin(ES, Instance, Disable, Register, - Deregister)); -} - -Expected> -UnwindInfoRegistrationPlugin::Create(IRLayer &IRL, JITDylib &PlatformJD) { +UnwindInfoRegistrationPlugin::Create(ExecutionSession &ES) { - ExecutorAddr Instance, FindHelper, Enable, Disable, Register, Deregister; + ExecutorAddr Register, Deregister; - auto &EPC = IRL.getExecutionSession().getExecutorProcessControl(); + auto &EPC = ES.getExecutorProcessControl(); if (auto Err = EPC.getBootstrapSymbols( - {{Instance, rt_alt::UnwindInfoManagerInstanceName}, - {FindHelper, rt_alt::UnwindInfoManagerFindSectionsHelperName}, - {Enable, rt_alt::UnwindInfoManagerEnableWrapperName}, - {Disable, rt_alt::UnwindInfoManagerDisableWrapperName}, - {Register, rt_alt::UnwindInfoManagerRegisterActionName}, + {{Register, rt_alt::UnwindInfoManagerRegisterActionName}, {Deregister, rt_alt::UnwindInfoManagerDeregisterActionName}})) return std::move(Err); - return Create(IRL, PlatformJD, Instance, FindHelper, Enable, Disable, - Register, Deregister); -} - -UnwindInfoRegistrationPlugin::~UnwindInfoRegistrationPlugin() { - using namespace shared; - using SPSDisableSig = SPSError(SPSExecutorAddr); - Error CallErr = 
Error::success(); - if (auto Err = ES.callSPSWrapper(Disable, CallErr, Instance)) { - consumeError(std::move(CallErr)); - ES.reportError(std::move(Err)); - } - if (CallErr) - ES.reportError(std::move(CallErr)); + return std::make_shared(ES, Register, + Deregister); } void UnwindInfoRegistrationPlugin::modifyPassConfig( @@ -118,43 +43,6 @@ void UnwindInfoRegistrationPlugin::modifyPassConfig( [this](LinkGraph &G) { return addUnwindInfoRegistrationActions(G); }); } -Expected -UnwindInfoRegistrationPlugin::makeBouncerModule(ExecutionSession &ES) { - auto Ctx = std::make_unique(); - auto M = std::make_unique("__libunwind_find_unwind_bouncer", *Ctx); - M->setTargetTriple(ES.getTargetTriple().str()); - - auto EscapeName = [](const char *N) { return std::string("\01") + N; }; - - auto *PtrTy = PointerType::getUnqual(*Ctx); - auto *OpaqueStructTy = StructType::create(*Ctx, "UnwindInfoMgr"); - auto *UnwindMgrInstance = new GlobalVariable( - *M, OpaqueStructTy, true, GlobalValue::ExternalLinkage, nullptr, - EscapeName(rt_alt::UnwindInfoManagerInstanceName)); - - auto *Int64Ty = Type::getInt64Ty(*Ctx); - auto *FindHelperTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy, PtrTy}, false); - auto *FindHelperFn = Function::Create( - FindHelperTy, GlobalValue::ExternalLinkage, - EscapeName(rt_alt::UnwindInfoManagerFindSectionsHelperName), *M); - - auto *FindFnTy = FunctionType::get(Int64Ty, {PtrTy, PtrTy}, false); - auto *FindFn = - Function::Create(FindFnTy, GlobalValue::ExternalLinkage, - EscapeName(FindDynamicUnwindSectionsFunctionName), *M); - auto *EntryBlock = BasicBlock::Create(M->getContext(), StringRef(), FindFn); - IRBuilder<> IB(EntryBlock); - - std::vector FindHelperArgs; - FindHelperArgs.push_back(UnwindMgrInstance); - for (auto &Arg : FindFn->args()) - FindHelperArgs.push_back(&Arg); - - IB.CreateRet(IB.CreateCall(FindHelperFn, FindHelperArgs)); - - return ThreadSafeModule(std::move(M), std::move(Ctx)); -} - Error UnwindInfoRegistrationPlugin::addUnwindInfoRegistrationActions( LinkGraph &G) { ExecutorAddrRange EHFrameRange, UnwindInfoRange; @@ -220,17 +108,15 @@ Error UnwindInfoRegistrationPlugin::addUnwindInfoRegistrationActions( using namespace shared; using SPSRegisterArgs = - SPSArgList, - SPSExecutorAddr, SPSExecutorAddrRange, SPSExecutorAddrRange>; - using SPSDeregisterArgs = - SPSArgList>; + SPSArgList, SPSExecutorAddr, + SPSExecutorAddrRange, SPSExecutorAddrRange>; + using SPSDeregisterArgs = SPSArgList>; G.allocActions().push_back( {cantFail(WrapperFunctionCall::Create( - Register, Instance, CodeRanges, DSOBase, EHFrameRange, - UnwindInfoRange)), - cantFail(WrapperFunctionCall::Create( - Deregister, Instance, CodeRanges))}); + Register, CodeRanges, DSOBase, EHFrameRange, UnwindInfoRange)), + cantFail(WrapperFunctionCall::Create(Deregister, + CodeRanges))}); return Error::success(); } diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index 5e13da172d677..2edfd786c5c23 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -52,6 +52,7 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple, case Triple::amdgcn: case Triple::nvptx: case Triple::nvptx64: + case Triple::spirv64: ActiveTraits.set(unsigned(TraitProperty::target_device_kind_gpu)); break; default: @@ -98,6 +99,7 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple, case Triple::amdgcn: case Triple::nvptx: case Triple::nvptx64: + case Triple::spirv64: 
ActiveTraits.set(unsigned(TraitProperty::device_kind_gpu)); ActiveTraits.set(unsigned(TraitProperty::target_device_kind_gpu)); break; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ea8fab94a2256..04acab1e5765e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6458,7 +6458,7 @@ void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB) { if (T.isNVPTX()) if (UB > 0) - updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true); + Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB)); if (T.isAMDGPU()) Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1"); diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index ef0591ef31744..8da1dfe914818 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -647,6 +647,9 @@ std::string Attribute::getAsString(bool InAttrGrp) const { case IRMemLocation::InaccessibleMem: OS << "inaccessiblemem: "; break; + case IRMemLocation::ErrnoMem: + OS << "errnomem: "; + break; case IRMemLocation::Other: llvm_unreachable("This is represented as the default access kind"); } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e886a6012b219..57072715366c9 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/AutoUpgrade.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/Dwarf.h" @@ -5043,6 +5044,21 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K, Idx, Attribute::getWithStackAlignment(GV->getContext(), StackAlign)); return true; } + if (K == "maxclusterrank" || K == "cluster_max_blocks") { + const auto CV = mdconst::extract(V)->getZExtValue(); + cast(GV)->addFnAttr("nvvm.maxclusterrank", llvm::utostr(CV)); + return true; + } + if (K == "minctasm") { + const auto CV = mdconst::extract(V)->getZExtValue(); + cast(GV)->addFnAttr("nvvm.minctasm", llvm::utostr(CV)); + return true; + } + if (K == "maxnreg") { + const auto CV = mdconst::extract(V)->getZExtValue(); + cast(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV)); + return true; + } return false; } diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt index 5f6254b231318..eb00829fd8c70 100644 --- a/llvm/lib/IR/CMakeLists.txt +++ b/llvm/lib/IR/CMakeLists.txt @@ -26,6 +26,8 @@ add_llvm_component_library(LLVMCore DiagnosticInfo.cpp DiagnosticPrinter.cpp Dominators.cpp + DroppedVariableStats.cpp + DroppedVariableStatsIR.cpp EHPersonalities.cpp FPEnv.cpp Function.cpp diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 33f4dc78c6d3f..9b69b1cb059da 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2395,10 +2395,10 @@ bool ConstantExpr::isDesirableBinOp(unsigned Opcode) { case Instruction::LShr: case Instruction::AShr: case Instruction::Shl: + case Instruction::Mul: return false; case Instruction::Add: case Instruction::Sub: - case Instruction::Mul: case Instruction::Xor: return true; default: diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index b49b4e4f3fd2d..bbe4d1f56c23d 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -960,20 +960,15 @@ DILexicalBlock *DIBuilder::createLexicalBlock(DIScope *Scope, DIFile *File, File, Line, Col); } -DbgInstPtr 
DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, - DIExpression *Expr, const DILocation *DL, - Instruction *InsertBefore) { - return insertDeclare(Storage, VarInfo, Expr, DL, InsertBefore->getParent(), - InsertBefore); -} - DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, BasicBlock *InsertAtEnd) { // If this block already has a terminator then insert this intrinsic before // the terminator. Otherwise, put it at the end of the block. Instruction *InsertBefore = InsertAtEnd->getTerminator(); - return insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd, InsertBefore); + return insertDeclare(Storage, VarInfo, Expr, DL, + InsertBefore ? InsertBefore->getIterator() + : InsertAtEnd->end()); } DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, @@ -988,11 +983,10 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, if (M.IsNewDbgInfoFormat) { DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); - BasicBlock *InsertBB = LinkedInstr->getParent(); // Insert after LinkedInstr. BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); - Instruction *InsertBefore = NextIt == InsertBB->end() ? nullptr : &*NextIt; - insertDbgVariableRecord(DVR, InsertBB, InsertBefore, true); + NextIt.setHeadBit(true); + insertDbgVariableRecord(DVR, NextIt); return DVR; } @@ -1018,47 +1012,11 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, return DVI; } -DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, - Instruction *InsertBefore) { - return insertLabel(LabelInfo, DL, - InsertBefore ? InsertBefore->getParent() : nullptr, - InsertBefore); -} - -DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, - BasicBlock *InsertAtEnd) { - return insertLabel(LabelInfo, DL, InsertAtEnd, nullptr); -} - -DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V, - DILocalVariable *VarInfo, - DIExpression *Expr, - const DILocation *DL, - Instruction *InsertBefore) { - DbgInstPtr DVI = insertDbgValueIntrinsic( - V, VarInfo, Expr, DL, InsertBefore ? InsertBefore->getParent() : nullptr, - InsertBefore); - if (auto *Inst = dyn_cast(DVI)) - cast(Inst)->setTailCall(); - return DVI; -} - -DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V, - DILocalVariable *VarInfo, - DIExpression *Expr, - const DILocation *DL, - BasicBlock *InsertAtEnd) { - return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr); -} - /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics. /// This abstracts over the various ways to specify an insert position. 
static void initIRBuilder(IRBuilder<> &Builder, const DILocation *DL, - BasicBlock *InsertBB, Instruction *InsertBefore) { - if (InsertBefore) - Builder.SetInsertPoint(InsertBefore); - else if (InsertBB) - Builder.SetInsertPoint(InsertBB); + InsertPosition InsertPt) { + Builder.SetInsertPoint(InsertPt.getBasicBlock(), InsertPt); Builder.SetCurrentDebugLocation(DL); } @@ -1071,26 +1029,28 @@ static Function *getDeclareIntrin(Module &M) { return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare); } -DbgInstPtr DIBuilder::insertDbgValueIntrinsic( - llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, - const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) { +DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, + DILocalVariable *VarInfo, + DIExpression *Expr, + const DILocation *DL, + InsertPosition InsertPt) { if (M.IsNewDbgInfoFormat) { DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertBB, InsertBefore); + insertDbgVariableRecord(DVR, InsertPt); return DVR; } if (!ValueFn) ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); - return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB, - InsertBefore); + auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt); + cast(DVI)->setTailCall(); + return DVI; } DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, - Instruction *InsertBefore) { + InsertPosition InsertPt) { assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.declare"); assert(DL && "Expected debug loc"); assert(DL->getScope()->getSubprogram() == @@ -1100,7 +1060,7 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, if (M.IsNewDbgInfoFormat) { DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertBB, InsertBefore); + insertDbgVariableRecord(DVR, InsertPt); return DVR; } @@ -1114,35 +1074,27 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, MetadataAsValue::get(VMContext, Expr)}; IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertBB, InsertBefore); + initIRBuilder(B, DL, InsertPt); return B.CreateCall(DeclareFn, Args); } void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR, - BasicBlock *InsertBB, - Instruction *InsertBefore, - bool InsertAtHead) { - assert(InsertBefore || InsertBB); + InsertPosition InsertPt) { + assert(InsertPt.isValid()); trackIfUnresolved(DVR->getVariable()); trackIfUnresolved(DVR->getExpression()); if (DVR->isDbgAssign()) trackIfUnresolved(DVR->getAddressExpression()); - BasicBlock::iterator InsertPt; - if (InsertBB && InsertBefore) - InsertPt = InsertBefore->getIterator(); - else if (InsertBB) - InsertPt = InsertBB->end(); - InsertPt.setHeadBit(InsertAtHead); - InsertBB->insertDbgRecordBefore(DVR, InsertPt); + auto *BB = InsertPt.getBasicBlock(); + BB->insertDbgRecordBefore(DVR, InsertPt); } Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn, Value *V, DILocalVariable *VarInfo, DIExpression *Expr, const DILocation *DL, - BasicBlock *InsertBB, - Instruction *InsertBefore) { + InsertPosition InsertPt) { assert(IntrinsicFn && "must pass a non-null intrinsic function"); assert(V && "must pass a value to a dbg intrinsic"); assert(VarInfo && @@ -1159,13 +1111,12 @@ Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function 
*IntrinsicFn, MetadataAsValue::get(VMContext, Expr)}; IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertBB, InsertBefore); + initIRBuilder(B, DL, InsertPt); return B.CreateCall(IntrinsicFn, Args); } DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, - BasicBlock *InsertBB, - Instruction *InsertBefore) { + InsertPosition InsertPt) { assert(LabelInfo && "empty or invalid DILabel* passed to dbg.label"); assert(DL && "Expected debug loc"); assert(DL->getScope()->getSubprogram() == @@ -1175,10 +1126,10 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, trackIfUnresolved(LabelInfo); if (M.IsNewDbgInfoFormat) { DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); - if (InsertBB && InsertBefore) - InsertBB->insertDbgRecordBefore(DLR, InsertBefore->getIterator()); - else if (InsertBB) - InsertBB->insertDbgRecordBefore(DLR, InsertBB->end()); + if (InsertPt.isValid()) { + auto *BB = InsertPt.getBasicBlock(); + BB->insertDbgRecordBefore(DLR, InsertPt); + } return DLR; } @@ -1188,7 +1139,7 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertBB, InsertBefore); + initIRBuilder(B, DL, InsertPt); return B.CreateCall(LabelFn, Args); } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 4ce518009bd3e..cc36b71190ce2 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -798,7 +798,11 @@ class DebugTypeInfoRemoval { return getReplacementMDNode(N); }; - Replacements[N] = doRemap(N); + // Seperate recursive doRemap and operator [] into 2 lines to avoid + // out-of-order evaluations since both of them can access the same memory + // location in map Replacements. + auto Value = doRemap(N); + Replacements[N] = Value; } /// Do the remapping traversal. @@ -1686,7 +1690,8 @@ LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordBefore( DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare( unwrap(Storage), unwrap(VarInfo), unwrap(Expr), unwrap(DL), - unwrap(Instr)); + Instr ? InsertPosition(unwrap(Instr)->getIterator()) + : nullptr); // This assert will fail if the module is in the old debug info format. // This function should only be called if the module is in the new // debug info format. @@ -1718,7 +1723,9 @@ LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordBefore( LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) { DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic( unwrap(Val), unwrap(VarInfo), unwrap(Expr), - unwrap(DebugLoc), unwrap(Instr)); + unwrap(DebugLoc), + Instr ? InsertPosition(unwrap(Instr)->getIterator()) + : nullptr); // This assert will fail if the module is in the old debug info format. // This function should only be called if the module is in the new // debug info format. @@ -1734,7 +1741,8 @@ LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordAtEnd( LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) { DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic( unwrap(Val), unwrap(VarInfo), unwrap(Expr), - unwrap(DebugLoc), unwrap(Block)); + unwrap(DebugLoc), + Block ? InsertPosition(unwrap(Block)->end()) : nullptr); // This assert will fail if the module is in the old debug info format. // This function should only be called if the module is in the new // debug info format. 
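The comment added to DebugTypeInfoRemoval above is worth a concrete illustration. A standalone sketch (hypothetical types and names, not the actual remapper) of why the recursive call is evaluated into a local before the map subscript:

#include "llvm/ADT/DenseMap.h"

// Hypothetical recursive computation that, like doRemap, may insert further
// entries into the same map while producing its result.
static int remapValue(llvm::DenseMap<int, int> &Cache, int N) {
  if (N <= 0)
    return 0;
  int V = remapValue(Cache, N - 1) + 1;
  Cache[N - 1] = V - 1; // growing the map here may force a rehash
  return V;
}

static void cacheRemapped(llvm::DenseMap<int, int> &Cache, int N) {
  // Risky form: Cache[N] = remapValue(Cache, N);
  // If the subscript on the left is evaluated first, the slot reference it
  // yields can be invalidated by a rehash inside the recursive call before
  // the assignment writes through it. Splitting the statement makes the
  // order explicit and performs the insertion last.
  int V = remapValue(Cache, N);
  Cache[N] = V;
}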
@@ -1800,21 +1808,25 @@ void LLVMInstructionSetDebugLoc(LLVMValueRef Inst, LLVMMetadataRef Loc) { unwrap(Inst)->setDebugLoc(DebugLoc()); } -LLVMMetadataRef LLVMDIBuilderCreateLabel( - LLVMDIBuilderRef Builder, - LLVMMetadataRef Context, const char *Name, size_t NameLen, - LLVMMetadataRef File, unsigned LineNo, LLVMBool AlwaysPreserve) { +LLVMMetadataRef LLVMDIBuilderCreateLabel(LLVMDIBuilderRef Builder, + LLVMMetadataRef Context, + const char *Name, size_t NameLen, + LLVMMetadataRef File, unsigned LineNo, + LLVMBool AlwaysPreserve) { return wrap(unwrap(Builder)->createLabel( - unwrapDI(Context), StringRef(Name, NameLen), - unwrapDI(File), LineNo, AlwaysPreserve)); + unwrapDI(Context), StringRef(Name, NameLen), + unwrapDI(File), LineNo, AlwaysPreserve)); } -LLVMDbgRecordRef LLVMDIBuilderInsertLabelBefore( - LLVMDIBuilderRef Builder, LLVMMetadataRef LabelInfo, - LLVMMetadataRef Location, LLVMValueRef InsertBefore) { +LLVMDbgRecordRef LLVMDIBuilderInsertLabelBefore(LLVMDIBuilderRef Builder, + LLVMMetadataRef LabelInfo, + LLVMMetadataRef Location, + LLVMValueRef InsertBefore) { DbgInstPtr DbgInst = unwrap(Builder)->insertLabel( - unwrapDI(LabelInfo), unwrapDI(Location), - unwrap(InsertBefore)); + unwrapDI(LabelInfo), unwrapDI(Location), + InsertBefore + ? InsertPosition(unwrap(InsertBefore)->getIterator()) + : nullptr); // This assert will fail if the module is in the old debug info format. // This function should only be called if the module is in the new // debug info format. @@ -1825,12 +1837,13 @@ LLVMDbgRecordRef LLVMDIBuilderInsertLabelBefore( return wrap(cast(DbgInst)); } -LLVMDbgRecordRef LLVMDIBuilderInsertLabelAtEnd( - LLVMDIBuilderRef Builder, LLVMMetadataRef LabelInfo, - LLVMMetadataRef Location, LLVMBasicBlockRef InsertAtEnd) { +LLVMDbgRecordRef LLVMDIBuilderInsertLabelAtEnd(LLVMDIBuilderRef Builder, + LLVMMetadataRef LabelInfo, + LLVMMetadataRef Location, + LLVMBasicBlockRef InsertAtEnd) { DbgInstPtr DbgInst = unwrap(Builder)->insertLabel( - unwrapDI(LabelInfo), unwrapDI(Location), - unwrap(InsertAtEnd)); + unwrapDI(LabelInfo), unwrapDI(Location), + InsertAtEnd ? InsertPosition(unwrap(InsertAtEnd)->end()) : nullptr); // This assert will fail if the module is in the old debug info format. // This function should only be called if the module is in the new // debug info format. diff --git a/llvm/lib/IR/DroppedVariableStats.cpp b/llvm/lib/IR/DroppedVariableStats.cpp new file mode 100644 index 0000000000000..d80ac9339a1b2 --- /dev/null +++ b/llvm/lib/IR/DroppedVariableStats.cpp @@ -0,0 +1,147 @@ +///===- DroppedVariableStats.cpp ----------------------------------------===// +/// +/// Part of the LLVM Project, under the Apache License v2.0 with LLVM +/// Exceptions. See https://llvm.org/LICENSE.txt for license information. +/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +/// +///===---------------------------------------------------------------------===// +/// \file +/// Dropped Variable Statistics for Debug Information. Reports any number +/// of #dbg_value that get dropped due to an optimization pass. 
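+/// When enabled, a CSV header is printed once up front and one row follows
+/// for each pass that dropped at least one variable:
+///   Pass Level, Pass Name, Num of Dropped Variables, Func or Module Name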
+/// +///===---------------------------------------------------------------------===// + +#include "llvm/IR/DroppedVariableStats.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +DroppedVariableStats::DroppedVariableStats(bool DroppedVarStatsEnabled) + : DroppedVariableStatsEnabled(DroppedVarStatsEnabled) { + if (DroppedVarStatsEnabled) + llvm::outs() << "Pass Level, Pass Name, Num of Dropped Variables, Func or " + "Module Name\n"; +} + +void DroppedVariableStats::setup() { + DebugVariablesStack.push_back({DenseMap()}); + InlinedAts.push_back({DenseMap>()}); +} + +void DroppedVariableStats::cleanup() { + assert(!DebugVariablesStack.empty() && + "DebugVariablesStack shouldn't be empty!"); + assert(!InlinedAts.empty() && "InlinedAts shouldn't be empty!"); + DebugVariablesStack.pop_back(); + InlinedAts.pop_back(); +} + +void DroppedVariableStats::calculateDroppedStatsAndPrint( + DebugVariables &DbgVariables, StringRef FuncName, StringRef PassID, + StringRef FuncOrModName, StringRef PassLevel, const Function *Func) { + unsigned DroppedCount = 0; + DenseSet &DebugVariablesBeforeSet = DbgVariables.DebugVariablesBefore; + DenseSet &DebugVariablesAfterSet = DbgVariables.DebugVariablesAfter; + if (InlinedAts.back().find(FuncName) == InlinedAts.back().end()) + return; + DenseMap &InlinedAtsMap = InlinedAts.back()[FuncName]; + // Find an Instruction that shares the same scope as the dropped #dbg_value + // or has a scope that is the child of the scope of the #dbg_value, and has + // an inlinedAt equal to the inlinedAt of the #dbg_value or it's inlinedAt + // chain contains the inlinedAt of the #dbg_value, if such an Instruction is + // found, debug information is dropped. + for (VarID Var : DebugVariablesBeforeSet) { + if (DebugVariablesAfterSet.contains(Var)) + continue; + visitEveryInstruction(DroppedCount, InlinedAtsMap, Var); + removeVarFromAllSets(Var, Func); + } + if (DroppedCount > 0) { + llvm::outs() << PassLevel << ", " << PassID << ", " << DroppedCount << ", " + << FuncOrModName << "\n"; + PassDroppedVariables = true; + } else + PassDroppedVariables = false; +} + +bool DroppedVariableStats::updateDroppedCount( + DILocation *DbgLoc, const DIScope *Scope, const DIScope *DbgValScope, + DenseMap &InlinedAtsMap, VarID Var, + unsigned &DroppedCount) { + // If the Scope is a child of, or equal to the DbgValScope and is inlined at + // the Var's InlinedAt location, return true to signify that the Var has + // been dropped. + if (isScopeChildOfOrEqualTo(Scope, DbgValScope)) + if (isInlinedAtChildOfOrEqualTo(DbgLoc->getInlinedAt(), + InlinedAtsMap[Var])) { + // Found another instruction in the variable's scope, so there exists a + // break point at which the variable could be observed. Count it as + // dropped. + DroppedCount++; + return true; + } + return false; +} + +void DroppedVariableStats::run(DebugVariables &DbgVariables, StringRef FuncName, + bool Before) { + auto &VarIDSet = (Before ? 
DbgVariables.DebugVariablesBefore + : DbgVariables.DebugVariablesAfter); + auto &InlinedAtsMap = InlinedAts.back(); + if (Before) + InlinedAtsMap.try_emplace(FuncName, DenseMap()); + VarIDSet = DenseSet(); + visitEveryDebugRecord(VarIDSet, InlinedAtsMap, FuncName, Before); +} + +void DroppedVariableStats::populateVarIDSetAndInlinedMap( + const DILocalVariable *DbgVar, DebugLoc DbgLoc, DenseSet &VarIDSet, + DenseMap> &InlinedAtsMap, + StringRef FuncName, bool Before) { + VarID Key{DbgVar->getScope(), DbgLoc->getInlinedAtScope(), DbgVar}; + VarIDSet.insert(Key); + if (Before) + InlinedAtsMap[FuncName].try_emplace(Key, DbgLoc.getInlinedAt()); +} + +void DroppedVariableStats::removeVarFromAllSets(VarID Var, const Function *F) { + // Do not remove Var from the last element, it will be popped from the + // stack. + for (auto &DebugVariablesMap : llvm::drop_end(DebugVariablesStack)) + DebugVariablesMap[F].DebugVariablesBefore.erase(Var); +} + +bool DroppedVariableStats::isScopeChildOfOrEqualTo(const DIScope *Scope, + const DIScope *DbgValScope) { + while (Scope != nullptr) { + if (VisitedScope.find(Scope) == VisitedScope.end()) { + VisitedScope.insert(Scope); + if (Scope == DbgValScope) { + VisitedScope.clear(); + return true; + } + Scope = Scope->getScope(); + } else { + VisitedScope.clear(); + return false; + } + } + return false; +} + +bool DroppedVariableStats::isInlinedAtChildOfOrEqualTo( + const DILocation *InlinedAt, const DILocation *DbgValInlinedAt) { + if (DbgValInlinedAt == InlinedAt) + return true; + if (!DbgValInlinedAt) + return false; + auto *IA = InlinedAt; + while (IA) { + if (IA == DbgValInlinedAt) + return true; + IA = IA->getInlinedAt(); + } + return false; +} diff --git a/llvm/lib/Passes/DroppedVariableStatsIR.cpp b/llvm/lib/IR/DroppedVariableStatsIR.cpp similarity index 70% rename from llvm/lib/Passes/DroppedVariableStatsIR.cpp rename to llvm/lib/IR/DroppedVariableStatsIR.cpp index e1c277e87efb3..382a024c3ee15 100644 --- a/llvm/lib/Passes/DroppedVariableStatsIR.cpp +++ b/llvm/lib/IR/DroppedVariableStatsIR.cpp @@ -11,10 +11,48 @@ /// ///===---------------------------------------------------------------------===// -#include "llvm/Passes/DroppedVariableStatsIR.h" +#include "llvm/IR/DroppedVariableStatsIR.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassInstrumentation.h" using namespace llvm; +template +const IRUnitT *DroppedVariableStatsIR::unwrapIR(Any IR) { + const IRUnitT **IRPtr = llvm::any_cast(&IR); + return IRPtr ? 
*IRPtr : nullptr; +} + +void DroppedVariableStatsIR::runBeforePass(StringRef P, Any IR) { + setup(); + if (const auto *M = unwrapIR(IR)) + return this->runOnModule(P, M, true); + if (const auto *F = unwrapIR(IR)) + return this->runOnFunction(P, F, true); +} + +void DroppedVariableStatsIR::runAfterPass(StringRef P, Any IR) { + if (const auto *M = unwrapIR(IR)) + runAfterPassModule(P, M); + else if (const auto *F = unwrapIR(IR)) + runAfterPassFunction(P, F); + cleanup(); +} + +void DroppedVariableStatsIR::runAfterPassFunction(StringRef PassID, + const Function *F) { + runOnFunction(PassID, F, false); + calculateDroppedVarStatsOnFunction(F, PassID, F->getName().str(), "Function"); +} + +void DroppedVariableStatsIR::runAfterPassModule(StringRef PassID, + const Module *M) { + runOnModule(PassID, M, false); + calculateDroppedVarStatsOnModule(M, PassID, M->getName().str(), "Module"); +} + void DroppedVariableStatsIR::runOnFunction(StringRef PassID, const Function *F, bool Before) { auto &DebugVariables = DebugVariablesStack.back()[F]; diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 335febde3687c..09d4b518cc1c5 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -636,7 +636,7 @@ void MCContext::recordELFMergeableSectionInfo(StringRef SectionName, unsigned Flags, unsigned UniqueID, unsigned EntrySize) { bool IsMergeable = Flags & ELF::SHF_MERGE; - if (UniqueID == GenericSectionID) { + if (UniqueID == MCSection::NonUniqueID) { ELFSeenGenericMergeableSections.insert(SectionName); // Minor performance optimization: avoid hash map lookup in // isELFGenericMergeableSection, which will return true for SectionName. @@ -727,14 +727,15 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, MCSectionCOFF *MCContext::getCOFFSection(StringRef Section, unsigned Characteristics) { - return getCOFFSection(Section, Characteristics, "", 0, GenericSectionID); + return getCOFFSection(Section, Characteristics, "", 0, + MCSection::NonUniqueID); } MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym, unsigned UniqueID) { // Return the normal section if we don't have to be associative or unique. 
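  // MCSection::NonUniqueID is the sentinel for "the single, non-uniquified
  // section of this name"; with no key symbol and the sentinel ID there is
  // nothing associative or unique to create, so the section is reused as-is.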
- if (!KeySym && UniqueID == GenericSectionID) + if (!KeySym && UniqueID == MCSection::NonUniqueID) return Sec; // If we have a key symbol, make an associative section with the same name and diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 150e38a94db6a..ab7552ca01061 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -1066,7 +1066,7 @@ MCSection *MCObjectFileInfo::getDwarfComdatSection(const char *Name, utostr(Hash), /*IsComdat=*/true); case Triple::Wasm: return Ctx->getWasmSection(Name, SectionKind::getMetadata(), 0, - utostr(Hash), MCContext::GenericSectionID); + utostr(Hash), MCSection::NonUniqueID); case Triple::MachO: case Triple::COFF: case Triple::GOFF: diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index d8ab30f296c3c..a3c40d6194607 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -193,7 +193,7 @@ class WasmAsmParser : public MCAsmParserExtension { // TODO: Parse UniqueID MCSectionWasm *WS = getContext().getWasmSection( - Name, *Kind, Flags, GroupName, MCContext::GenericSectionID); + Name, *Kind, Flags, GroupName, MCSection::NonUniqueID); if (WS->getSegmentFlags() != Flags) Parser->Error(loc, "changed section flags for " + Name + diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp index db1e7e704f62e..7806953aecd29 100644 --- a/llvm/lib/Object/GOFFObjectFile.cpp +++ b/llvm/lib/Object/GOFFObjectFile.cpp @@ -564,8 +564,7 @@ section_iterator GOFFObjectFile::section_end() const { void GOFFObjectFile::moveSymbolNext(DataRefImpl &Symb) const { for (uint32_t I = Symb.d.a + 1, E = EsdPtrs.size(); I < E; ++I) { - if (EsdPtrs[I]) { - const uint8_t *EsdRecord = EsdPtrs[I]; + if (const uint8_t *EsdRecord = EsdPtrs[I]) { GOFF::ESDSymbolType SymbolType; ESDRecord::getSymbolType(EsdRecord, SymbolType); // Skip EDs - i.e. section symbols. diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt index 23799ac4f98f7..6425f4934b210 100644 --- a/llvm/lib/Passes/CMakeLists.txt +++ b/llvm/lib/Passes/CMakeLists.txt @@ -1,6 +1,5 @@ add_llvm_component_library(LLVMPasses CodeGenPassBuilder.cpp - DroppedVariableStatsIR.cpp OptimizationLevel.cpp PassBuilder.cpp PassBuilderBindings.cpp diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e7ba7213a76fe..650d23ac1d5ef 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -119,6 +119,7 @@ #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/OptimizePHIs.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 63e70d7e182bd..df59a76772e20 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -410,6 +410,19 @@ static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { Phase == ThinOrFullLTOPhase::FullLTOPreLink; } +// Helper to wrap conditionally Coro passes. +static CoroConditionalWrapper buildCoroWrapper(ThinOrFullLTOPhase Phase) { + // TODO: Skip passes according to Phase. 
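+  // The wrapped pipeline is the standard coroutine lowering sequence:
+  // CoroEarly (module) -> CoroSplit (per SCC, via the CGSCC adaptor) ->
+  // CoroCleanup (module), followed by GlobalDCE to drop helpers left
+  // unreferenced by splitting. CoroConditionalWrapper only runs this inner
+  // pipeline when the module declares coroutine intrinsics, so non-coroutine
+  // code does not pay for it.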
+ ModulePassManager CoroPM; + CoroPM.addPass(CoroEarlyPass()); + CGSCCPassManager CGPM; + CGPM.addPass(CoroSplitPass()); + CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + CoroPM.addPass(CoroCleanupPass()); + CoroPM.addPass(GlobalDCEPass()); + return CoroConditionalWrapper(std::move(CoroPM)); +} + // TODO: Investigate the cost/benefit of tail call elimination on debugging. FunctionPassManager PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, @@ -1827,6 +1840,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, lowertypetests::DropTestKind::Assume)); + MPM.addPass(buildCoroWrapper(ThinOrFullLTOPhase::FullLTOPostLink)); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -1911,6 +1926,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, lowertypetests::DropTestKind::Assume)); + MPM.addPass(buildCoroWrapper(ThinOrFullLTOPhase::FullLTOPostLink)); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. @@ -1919,6 +1936,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, return MPM; } + // TODO: Skip to match buildCoroWrapper. + MPM.addPass(CoroEarlyPass()); + // Optimize globals to try and fold them into constants. MPM.addPass(GlobalOptPass()); @@ -1984,7 +2004,11 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // If we didn't decide to inline a function, check to see if we can // transform it to pass arguments by value instead of by reference. - MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); + CGSCCPassManager CGPM; + CGPM.addPass(ArgumentPromotionPass()); + CGPM.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); + CGPM.addPass(CoroAnnotationElidePass()); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); FunctionPassManager FPM; // The IPO Passes may leave cruft around. Clean up after them. @@ -2136,6 +2160,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true)); + MPM.addPass(CoroCleanupPass()); + invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); // Emit annotation remarks. 
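Because the coroutine passes are now added before invokeFullLinkTimeOptimizationLastEPCallbacks, passes registered at that extension point observe already-lowered coroutines in the full-LTO pipelines. A minimal sketch of such a registration (hypothetical helper name; VerifierPass is only a stand-in for whatever the callback wants to schedule):

#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"

// Hypothetical registration helper: anything added through this callback now
// runs after CoroEarly/CoroSplit/CoroCleanup in the LTO pipelines above.
static void registerLateLTOVerification(llvm::PassBuilder &PB) {
  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [](llvm::ModulePassManager &MPM, llvm::OptimizationLevel Level) {
        (void)Level; // level-independent in this sketch
        MPM.addPass(llvm::VerifierPass());
      });
}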
@@ -2250,14 +2276,7 @@ PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } - ModulePassManager CoroPM; - CoroPM.addPass(CoroEarlyPass()); - CGSCCPassManager CGPM; - CGPM.addPass(CoroSplitPass()); - CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); - CoroPM.addPass(CoroCleanupPass()); - CoroPM.addPass(GlobalDCEPass()); - MPM.addPass(CoroConditionalWrapper(std::move(CoroPM))); + MPM.addPass(buildCoroWrapper(Phase)); invokeOptimizerLastEPCallbacks(MPM, Level, Phase); diff --git a/llvm/lib/Support/ModRef.cpp b/llvm/lib/Support/ModRef.cpp index d3b3dd11171f1..2bb9bc945bd2e 100644 --- a/llvm/lib/Support/ModRef.cpp +++ b/llvm/lib/Support/ModRef.cpp @@ -43,6 +43,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, MemoryEffects ME) { case IRMemLocation::InaccessibleMem: OS << "InaccessibleMem: "; break; + case IRMemLocation::ErrnoMem: + OS << "ErrnoMem: "; + break; case IRMemLocation::Other: OS << "Other: "; break; diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index 550b0de2e0455..b5c3719f57963 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -31,6 +31,9 @@ #ifdef HAVE_MALLOC_MALLOC_H #include #endif +#ifdef HAVE_GETAUXVAL +#include +#endif //===----------------------------------------------------------------------===// //=== WARNING: Implementation here must contain only generic UNIX code that @@ -63,7 +66,9 @@ Process::Pid Process::getProcessId() { // On Cygwin, getpagesize() returns 64k(AllocationGranularity) and // offset in mmap(3) should be aligned to the AllocationGranularity. Expected Process::getPageSize() { -#if defined(HAVE_GETPAGESIZE) +#if defined(HAVE_GETAUXVAL) + static const int page_size = ::getauxval(AT_PAGESZ); +#elif defined(HAVE_GETPAGESIZE) static const int page_size = ::getpagesize(); #elif defined(HAVE_SYSCONF) static long page_size = ::sysconf(_SC_PAGE_SIZE); @@ -73,6 +78,8 @@ Expected Process::getPageSize() { if (page_size == -1) return errorCodeToError(errnoAsErrorCode()); + assert(page_size > 0 && "Page size cannot be 0"); + assert((page_size % 1024) == 0 && "Page size must be aligned by 1024"); return static_cast(page_size); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index d3abd79b85a75..d118022395762 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2612,8 +2612,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, } } -bool AArch64FrameLowering::enableCFIFixup(MachineFunction &MF) const { +bool AArch64FrameLowering::enableCFIFixup(const MachineFunction &MF) const { return TargetFrameLowering::enableCFIFixup(MF) && + MF.getInfo()->needsDwarfUnwindInfo(MF); +} + +bool AArch64FrameLowering::enableFullCFIFixup(const MachineFunction &MF) const { + return enableCFIFixup(MF) && MF.getInfo()->needsAsyncDwarfUnwindInfo(MF); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 8f84702f4d2ba..e7d52bb350f13 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -36,7 +36,9 @@ class AArch64FrameLowering : public TargetFrameLowering { void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool enableCFIFixup(MachineFunction &MF) 
const override; + bool enableCFIFixup(const MachineFunction &MF) const override; + + bool enableFullCFIFixup(const MachineFunction &MF) const override; bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0d1608a97bfd3..4263be1098899 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15508,7 +15508,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + return (ShuffleVectorSDNode::isSplatMask(M) || isREVMask(M, EltSize, NumElts, 64) || isREVMask(M, EltSize, NumElts, 32) || isREVMask(M, EltSize, NumElts, 16) || diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index d13bb908df78e..03ae42493a035 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12538,26 +12538,19 @@ multiclass STOPregister { !cast(instr # "X")>; } +let Predicates = [HasLSUI] in class BaseSTOPregisterLSUI : - InstAlias; + InstAlias; multiclass STOPregisterLSUI { - def : BaseSTOPregisterLSUI(instr # "W")>; - def : BaseSTOPregisterLSUI(instr # "X")>; - def : BaseSTOPregisterLSUI(instr # "W")>; - def : BaseSTOPregisterLSUI(instr # "X")>; - def : BaseSTOPregisterLSUI(instr # "W")>; - def : BaseSTOPregisterLSUI(instr # "X")>; - def : BaseSTOPregisterLSUI(instr # "LW")>; + def : BaseSTOPregisterLSUI(instr # "LX")>; + def : BaseSTOPregisterLSUI(instr # "W")>; - def : BaseSTOPregisterLSUI(instr # "X")>; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 658ac7490eb33..c45b311b6ebb2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2665,21 +2665,11 @@ defm CASLT : CompareAndSwapUnprivileged<0b11, 0, 1, "l">; defm CASAT : CompareAndSwapUnprivileged<0b11, 1, 0, "a">; defm CASALT : CompareAndSwapUnprivileged<0b11, 1, 1, "al">; -def : MnemonicAlias<"cas", "cast">; -def : MnemonicAlias<"casl", "caslt">; -def : MnemonicAlias<"casa", "casat">; -def : MnemonicAlias<"casal", "casalt">; - // v9.6-a atomic CASPT defm CASPT : CompareAndSwapPairUnprivileged<0b01, 0, 0, "">; defm CASPLT : CompareAndSwapPairUnprivileged<0b01, 0, 1, "l">; defm CASPAT : CompareAndSwapPairUnprivileged<0b01, 1, 0, "a">; defm CASPALT : CompareAndSwapPairUnprivileged<0b01, 1, 1, "al">; - -def : MnemonicAlias<"casp", "caspt">; -def : MnemonicAlias<"caspl", "casplt">; -def : MnemonicAlias<"caspa", "caspat">; -def : MnemonicAlias<"caspal", "caspalt">; } // v8.1 atomic SWP @@ -2694,11 +2684,6 @@ let Predicates = [HasLSUI] in { defm SWPTA : SwapLSUI<1, 0, "a">; defm SWPTL : SwapLSUI<0, 1, "l">; defm SWPTAL : SwapLSUI<1, 1, "al">; - - def : MnemonicAlias<"swp", "swpt">; - def : MnemonicAlias<"swpa", "swpta">; - def : MnemonicAlias<"swpl", "swptl">; - def : MnemonicAlias<"swpal", "swptal">; } // v9.6-a unprivileged atomic LD (FEAT_LSUI) @@ -4863,22 +4848,14 @@ let Predicates = [HasLSUI] in { defm LDTXRW : LoadUnprivilegedLSUI<0b10, GPR32, "ldtxr">; defm LDTXRX : LoadUnprivilegedLSUI<0b11, GPR64, "ldtxr">; -def : MnemonicAlias<"ldxr", "ldtxr">; - def LDATXRW : LoadExclusiveLSUI <0b10, 1, 1, GPR32, "ldatxr">; def LDATXRX : LoadExclusiveLSUI <0b11, 1, 1, GPR64, "ldatxr">; 
-def : MnemonicAlias<"ldaxr", "ldatxr">; - defm STTXRW : StoreUnprivilegedLSUI<0b10, GPR32, "sttxr">; defm STTXRX : StoreUnprivilegedLSUI<0b11, GPR64, "sttxr">; -def : MnemonicAlias<"stxr", "sttxr">; - def STLTXRW : StoreExclusiveLSUI<0b10, 0, 1, GPR32, "stltxr">; def STLTXRX : StoreExclusiveLSUI<0b11, 0, 1, GPR64, "stltxr">; - -def : MnemonicAlias<"stlxr", "stltxr">; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index bd0d55f571234..a40e4e563843e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4683,7 +4683,9 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( InstructionCost Invalid = InstructionCost::getInvalid(); InstructionCost Cost(TTI::TCC_Basic); - if (Opcode != Instruction::Add) + // Sub opcodes currently only occur in chained cases. + // Independent partial reduction subtractions are still costed as an add + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return Invalid; if (InputTypeA != InputTypeB) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 67bad5884c260..de3253e64b978 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -460,7 +460,7 @@ extern char &GCNPreRAOptimizationsID; FunctionPass *createAMDGPUSetWavePriorityPass(); void initializeAMDGPUSetWavePriorityPass(PassRegistry &); -void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &); +void initializeGCNRewritePartialRegUsesLegacyPass(llvm::PassRegistry &); extern char &GCNRewritePartialRegUsesID; void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index bb00442342d84..478a4c161fce7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -816,7 +816,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, Register InputReg = MRI.createGenericVirtualRegister(ArgTy); if (IncomingArg) { - LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); + LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { @@ -883,8 +883,9 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, NeedWorkItemIDX) { if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) { InputReg = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX, - std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX)); + LI->buildLoadInputValue(InputReg, MIRBuilder, IncomingArgX, + std::get<1>(WorkitemIDX), + std::get<2>(WorkitemIDX)); } else { InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0); } @@ -893,8 +894,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) { Register Y = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY), - std::get<2>(WorkitemIDY)); + LI->buildLoadInputValue(Y, MIRBuilder, IncomingArgY, + std::get<1>(WorkitemIDY), 
std::get<2>(WorkitemIDY)); Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0); InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y; @@ -903,8 +904,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) { Register Z = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ), - std::get<2>(WorkitemIDZ)); + LI->buildLoadInputValue(Z, MIRBuilder, IncomingArgZ, + std::get<1>(WorkitemIDZ), std::get<2>(WorkitemIDZ)); Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0); InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z; @@ -925,8 +926,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, ArgDescriptor IncomingArg = ArgDescriptor::createArg( IncomingArgX ? *IncomingArgX : IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u); - LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, - &AMDGPU::VGPR_32RegClass, S32); + LI->buildLoadInputValue(InputReg, MIRBuilder, &IncomingArg, + &AMDGPU::VGPR_32RegClass, S32); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e9e47eaadd557..908d323c7fec9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4275,10 +4275,11 @@ verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, return UseMI; } -bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, - const ArgDescriptor *Arg, - const TargetRegisterClass *ArgRC, - LLT ArgTy) const { +void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg, + MachineIRBuilder &B, + const ArgDescriptor *Arg, + const TargetRegisterClass *ArgRC, + LLT ArgTy) const { MCRegister SrcReg = Arg->getRegister(); assert(SrcReg.isPhysical() && "Physical register expected"); assert(DstReg.isVirtual() && "Virtual register expected"); @@ -4304,8 +4305,6 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, } else { B.buildCopy(DstReg, LiveIn); } - - return true; } bool AMDGPULegalizerInfo::loadInputValue( @@ -4369,7 +4368,8 @@ bool AMDGPULegalizerInfo::loadInputValue( if (!Arg->isRegister() || !Arg->getRegister().isValid()) return false; // TODO: Handle these - return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); + buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy); + return true; } bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 86c15197805d2..03b7c36fc450f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -111,9 +111,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; - bool loadInputValue(Register DstReg, MachineIRBuilder &B, - const ArgDescriptor *Arg, - const TargetRegisterClass *ArgRC, LLT ArgTy) const; + void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, + const ArgDescriptor *Arg, + const TargetRegisterClass *ArgRC, LLT ArgTy) const; bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index ccb874e6a934e..b0b6c4df8e982 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1804,14 +1804,25 @@ PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { bool IsNUW = GEP.hasNoUnsignedWrap(); bool IsNUSW = GEP.hasNoUnsignedSignedWrap(); + StructType *ResTy = cast(GEP.getType()); + Type *ResRsrcTy = ResTy->getElementType(0); + VectorType *ResRsrcVecTy = dyn_cast(ResRsrcTy); + bool BroadcastsPtr = ResRsrcVecTy && !isa(Off->getType()); + // In order to call emitGEPOffset() and thus not have to reimplement it, // we need the GEP result to have ptr addrspace(7) type. - Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER); - if (auto *VT = dyn_cast(Off->getType())) - FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount()); + Type *FatPtrTy = + ResRsrcTy->getWithNewType(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER)); GEP.mutateType(FatPtrTy); Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP); - GEP.mutateType(Ptr->getType()); + GEP.mutateType(ResTy); + + if (BroadcastsPtr) { + Rsrc = IRB.CreateVectorSplat(ResRsrcVecTy->getElementCount(), Rsrc, + Rsrc->getName()); + Off = IRB.CreateVectorSplat(ResRsrcVecTy->getElementCount(), Off, + Off->getName()); + } if (match(OffAccum, m_Zero())) { // Constant-zero offset SplitUsers.insert(&GEP); return {Rsrc, Off}; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index a5bfdb7bf6eac..57289c3e8bbf4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -367,6 +367,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: case Intrinsic::amdgcn_sched_group_barrier: + case Intrinsic::amdgcn_iglp_opt: return false; default: break; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index d9d97928062f5..14b35a4fd8327 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -98,6 +98,7 @@ FUNCTION_PASS_WITH_PARAMS( #endif MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass()) +MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) @@ -119,6 +120,7 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) #define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) +DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 8b387a191efba..7c0f1040a8156 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -34,6 +34,7 @@ #include "GCNIterativeScheduler.h" #include "GCNPreRALongBranchReg.h" #include "GCNPreRAOptimizations.h" +#include "GCNRewritePartialRegUses.h" #include "GCNSchedStrategy.h" #include "GCNVOPDUtils.h" #include "R600.h" @@ -70,6 +71,7 @@ #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineCSE.h" #include "llvm/CodeGen/MachineLICM.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -550,7 +552,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsLegacyPass(*PR); initializeGCNPreRALongBranchRegLegacyPass(*PR); - initializeGCNRewritePartialRegUsesPass(*PR); + initializeGCNRewritePartialRegUsesLegacyPass(*PR); initializeGCNRegPressurePrinterPass(*PR); initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); @@ -1933,6 +1935,7 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( GCNTargetMachine &TM, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) : CodeGenPassBuilder(TM, Opts, PIC) { + Opt.MISchedPostRA = true; Opt.RequiresCodeGenSCCOrder = true; // Exceptions and StackMaps are not supported, so these passes will never do // anything. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 4b6d02fff4aec..54ed3789326cb 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1783,7 +1783,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool validateMIMGMSAA(const MCInst &Inst); bool validateOpSel(const MCInst &Inst); bool validateTrue16OpSel(const MCInst &Inst); - bool validateNeg(const MCInst &Inst, int OpName); + bool validateNeg(const MCInst &Inst, AMDGPU::OpName OpName); bool validateDPP(const MCInst &Inst, const OperandVector &Operands); bool validateVccOperand(MCRegister Reg) const; bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands); @@ -3959,8 +3959,9 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); - int RSrcOpName = (Desc.TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc - : AMDGPU::OpName::rsrc; + AMDGPU::OpName RSrcOpName = (Desc.TSFlags & SIInstrFlags::MIMG) + ? 
AMDGPU::OpName::srsrc + : AMDGPU::OpName::rsrc; int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RSrcOpName); int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim); int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16); @@ -4671,8 +4672,8 @@ bool AMDGPUAsmParser::validateTrue16OpSel(const MCInst &Inst) { if (OpSelOpValue == 0) return true; unsigned OpCount = 0; - for (int OpName : {AMDGPU::OpName::src0, AMDGPU::OpName::src1, - AMDGPU::OpName::src2, AMDGPU::OpName::vdst}) { + for (AMDGPU::OpName OpName : {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2, AMDGPU::OpName::vdst}) { int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), OpName); if (OpIdx == -1) continue; @@ -4690,7 +4691,7 @@ bool AMDGPUAsmParser::validateTrue16OpSel(const MCInst &Inst) { return true; } -bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { +bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, AMDGPU::OpName OpName) { assert(OpName == AMDGPU::OpName::neg_lo || OpName == AMDGPU::OpName::neg_hi); const unsigned Opc = Inst.getOpcode(); @@ -4715,9 +4716,9 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { // It is convenient that such instructions don't have src_modifiers operand // for src operands that don't allow neg because they also don't allow opsel. - int SrcMods[3] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers}; + const AMDGPU::OpName SrcMods[3] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; for (unsigned i = 0; i < 3; ++i) { if (!AMDGPU::hasNamedOperand(Opc, SrcMods[i])) { @@ -4844,9 +4845,9 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, } // Returns -1 if not a register, 0 if VGPR and 1 if AGPR. -static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx, +static int IsAGPROperand(const MCInst &Inst, AMDGPU::OpName Name, const MCRegisterInfo *MRI) { - int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), NameIdx); + int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), Name); if (OpIdx < 0) return -1; @@ -4867,12 +4868,13 @@ bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { SIInstrFlags::DS)) == 0) return true; - uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 - : AMDGPU::OpName::vdata; + AMDGPU::OpName DataName = (TSFlags & SIInstrFlags::DS) + ? 
AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; const MCRegisterInfo *MRI = getMRI(); int DstAreg = IsAGPROperand(Inst, AMDGPU::OpName::vdst, MRI); - int DataAreg = IsAGPROperand(Inst, DataNameIdx, MRI); + int DataAreg = IsAGPROperand(Inst, DataName, MRI); if ((TSFlags & SIInstrFlags::DS) && DataAreg >= 0) { int Data2Areg = IsAGPROperand(Inst, AMDGPU::OpName::data1, MRI); @@ -5876,6 +5878,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, ExprVal, ValRange); + } else if (ID == ".amdhsa_inst_pref_size") { + if (IVersion.Major < 11) + return Error(IDRange.Start, "directive requires gfx11+", IDRange); + if (IVersion.Major == 11) { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, + COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE, ExprVal, + ValRange); + } else { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, + COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE, ExprVal, + ValRange); + } } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { PARSE_BITS_ENTRY( KD.compute_pgm_rsrc2, @@ -8691,9 +8705,8 @@ static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) { return; int SrcNum; - const int Ops[] = { AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 }; + const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2}; for (SrcNum = 0; SrcNum < 3 && AMDGPU::hasNamedOperand(Opc, Ops[SrcNum]); ++SrcNum) ; @@ -8815,12 +8828,11 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands) if (OpSelIdx == -1) return; - const int Ops[] = { AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 }; - const int ModOps[] = { AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers }; + const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2}; + const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); @@ -8956,12 +8968,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (NegHiIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); - const int Ops[] = { AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 }; - const int ModOps[] = { AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers }; + const AMDGPU::OpName Ops[] = {AMDGPU::OpName::src0, AMDGPU::OpName::src1, + AMDGPU::OpName::src2}; + const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; unsigned OpSel = 0; unsigned OpSelHi = 0; @@ -9024,7 +9035,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { } static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands, - unsigned i, unsigned Opc, unsigned OpName) { + unsigned i, unsigned Opc, + AMDGPU::OpName OpName) { if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1) ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2); else diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 58cdbe6cf373e..308ab8e3b82c4 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -75,8 +75,8 @@ addOperand(MCInst 
&Inst, const MCOperand& Opnd) { } static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, - uint16_t NameIdx) { - int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx); + AMDGPU::OpName Name) { + int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), Name); if (OpIdx != -1) { auto *I = MI.begin(); std::advance(I, OpIdx); @@ -423,10 +423,11 @@ static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm, // are also tied. unsigned Opc = Inst.getOpcode(); uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags; - uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 - : AMDGPU::OpName::vdata; + AMDGPU::OpName DataName = (TSFlags & SIInstrFlags::DS) + ? AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo(); - int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx); + int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataName); if ((int)Inst.getNumOperands() == DataIdx) { int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); if (IsAGPROperand(Inst, DstIdx, MRI)) @@ -922,9 +923,9 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI, bool IsVOP3P = false) { VOPModifiers Modifiers; unsigned Opc = MI.getOpcode(); - const int ModOps[] = {AMDGPU::OpName::src0_modifiers, - AMDGPU::OpName::src1_modifiers, - AMDGPU::OpName::src2_modifiers}; + const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}; for (int J = 0; J < 3; ++J) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); if (OpIdx == -1) @@ -951,15 +952,15 @@ void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const { const unsigned Opc = MI.getOpcode(); const MCRegisterClass &ConversionRC = MRI.getRegClass(AMDGPU::VGPR_16RegClassID); - constexpr std::array, 4> OpAndOpMods = { - {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers, - SISrcMods::OP_SEL_0}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers, - SISrcMods::OP_SEL_0}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers, - SISrcMods::OP_SEL_0}, - {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers, - SISrcMods::DST_OP_SEL}}}; + constexpr std::array, 4> + OpAndOpMods = {{{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers, + SISrcMods::OP_SEL_0}, + {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers, + SISrcMods::DST_OP_SEL}}}; for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName); @@ -1069,8 +1070,9 @@ void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AMDGPU::OpName::vdata); int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); - int RsrcOpName = (TSFlags & SIInstrFlags::MIMG) ? AMDGPU::OpName::srsrc - : AMDGPU::OpName::rsrc; + AMDGPU::OpName RsrcOpName = (TSFlags & SIInstrFlags::MIMG) + ? AMDGPU::OpName::srsrc + : AMDGPU::OpName::rsrc; int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName); int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); @@ -2233,15 +2235,15 @@ Expected AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( // Bits [4-11]. 
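  // INST_PREF_SIZE can now be round-tripped: the assembler accepts an
  // .amdhsa_inst_pref_size directive on gfx11+, so it is printed as a real
  // directive rather than a pseudo-directive comment. The field lives in
  // COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE on gfx11 and in
  // COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE on gfx12 and later.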
if (isGFX11()) { - PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE", - COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE); + PRINT_DIRECTIVE(".amdhsa_inst_pref_size", + COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE); PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START", COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_START); PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END", COMPUTE_PGM_RSRC3_GFX11_TRAP_ON_END); } else if (isGFX12Plus()) { - PRINT_PSEUDO_DIRECTIVE_COMMENT( - "INST_PREF_SIZE", COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE); + PRINT_DIRECTIVE(".amdhsa_inst_pref_size", + COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE); } else { CHECK_RESERVED_BITS_DESC_MSG(COMPUTE_PGM_RSRC3_GFX10_RESERVED1, "COMPUTE_PGM_RSRC3", diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index cc802b5fbb67c..b22babb4a00d8 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -70,9 +70,7 @@ class GCNDPPCombine { RegSubRegPair CombOldVGPR, bool CombBCZ, bool IsShrinkable) const; - bool hasNoImmOrEqual(MachineInstr &MI, - unsigned OpndName, - int64_t Value, + bool hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName, int64_t Value, int64_t Mask = -1) const; bool combineDPPMov(MachineInstr &MI) const; @@ -513,7 +511,7 @@ MachineInstr *GCNDPPCombine::createDPPInst( // returns true if MI doesn't have OpndName immediate operand or the // operand has Value -bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, +bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, AMDGPU::OpName OpndName, int64_t Value, int64_t Mask) const { auto *Imm = TII->getNamedOperand(MI, OpndName); if (!Imm) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 73b44680aad5d..827598078af53 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1310,7 +1310,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { if (!SIInstrInfo::isVALU(*MI)) return false; - unsigned SDSTName; + AMDGPU::OpName SDSTName; switch (MI->getOpcode()) { case AMDGPU::V_READLANE_B32: case AMDGPU::V_READFIRSTLANE_B32: @@ -2598,20 +2598,24 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { return NumPasses + 2; } -static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { - // 2 pass -> 5 - // 4 pass -> 7 - // 8 pass -> 11 - // 16 pass -> 19 - return NumPasses + 3; +static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 5 5 + // 4 pass | 7 8 + // 8 pass | 11 12 + // 16 pass | 19 20 + return NumPasses + 3 + (NumPasses != 2 && IsGFX950); } -static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { - // 2 pass -> 5 - // 4 pass -> 7 - // 8 pass -> 11 - // 16 pass -> 19 - return NumPasses + 3; +static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 5 5 + // 4 pass | 7 8 + // 8 pass | 11 12 + // 16 pass | 19 20 + return NumPasses + 3 + (NumPasses != 2 && IsGFX950); } static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { @@ -2762,7 +2766,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } else if (ST.hasGFX940Insts()) { NeedWaitStates = isXDL(ST, *MFMA) - ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) + ? 
GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates( + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( NumPasses); } else { @@ -2848,7 +2853,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } else if (ST.hasGFX940Insts()) { NeedWaitStates = isXDL(ST, *MFMA) - ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses) + ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates( + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses); } else { switch (NumPasses) { diff --git a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp index 077ccf36ea4fb..c58d1b00a1002 100644 --- a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp @@ -28,6 +28,7 @@ /// calculation and creates more possibilities for the code unaware of lanemasks //===----------------------------------------------------------------------===// +#include "GCNRewritePartialRegUses.h" #include "AMDGPU.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" @@ -44,25 +45,7 @@ using namespace llvm; namespace { -class GCNRewritePartialRegUses : public MachineFunctionPass { -public: - static char ID; - GCNRewritePartialRegUses() : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { - return "Rewrite Partial Register Uses"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addPreserved(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - -private: +class GCNRewritePartialRegUsesImpl { MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; const TargetInstrInfo *TII; @@ -155,13 +138,36 @@ class GCNRewritePartialRegUses : public MachineFunctionPass { /// Cache for getAllocatableAndAlignedRegClassMask method: /// AlignNumBits -> Class bitmask. mutable SmallDenseMap AllocatableAndAlignedRegClassMasks; + +public: + GCNRewritePartialRegUsesImpl(LiveIntervals *LS) : LIS(LS) {} + bool run(MachineFunction &MF); +}; + +class GCNRewritePartialRegUsesLegacy : public MachineFunctionPass { +public: + static char ID; + GCNRewritePartialRegUsesLegacy() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "Rewrite Partial Register Uses"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; }; } // end anonymous namespace // TODO: move this to the tablegen and use binary search by Offset. 
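The restructuring that follows is the usual recipe for porting a machine pass to the new pass manager: the transformation logic moves into a plain Impl class, and thin legacy-PM and new-PM wrappers construct it with whatever analyses they can obtain. A compressed sketch of the pattern, using hypothetical FooImpl/FooPass names rather than the real classes:

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

namespace {
// All rewriting logic lives here; it depends only on the analyses it is
// handed, never on a particular pass manager.
class FooImpl {
  LiveIntervals *LIS;

public:
  explicit FooImpl(LiveIntervals *LIS) : LIS(LIS) {}
  bool run(MachineFunction &MF) {
    (void)MF;
    (void)LIS; // transform MF here, updating LIS only if it is non-null
    return false; // report whether anything changed
  }
};
} // end anonymous namespace

// New-PM wrapper: use LiveIntervals only if it is already cached, mirroring
// the legacy pass's getAnalysisIfAvailable behaviour.
struct FooPass : PassInfoMixin<FooPass> {
  PreservedAnalyses run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
    auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
    if (!FooImpl(LIS).run(MF))
      return PreservedAnalyses::all();
    PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
    PA.preserveSet<CFGAnalyses>();
    PA.preserve<LiveIntervalsAnalysis>();
    return PA;
  }
};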
-unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset, - unsigned Size) const { +unsigned GCNRewritePartialRegUsesImpl::getSubReg(unsigned Offset, + unsigned Size) const { const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0); if (Inserted) { for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { @@ -175,15 +181,14 @@ unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset, return I->second; } -unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg, - unsigned RShift) const { +unsigned GCNRewritePartialRegUsesImpl::shiftSubReg(unsigned SubReg, + unsigned RShift) const { unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift; return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg)); } -const uint32_t * -GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC, - unsigned SubRegIdx) const { +const uint32_t *GCNRewritePartialRegUsesImpl::getSuperRegClassMask( + const TargetRegisterClass *RC, unsigned SubRegIdx) const { const auto [I, Inserted] = SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr); if (Inserted) { @@ -197,7 +202,8 @@ GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC, return I->second; } -const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask( +const BitVector & +GCNRewritePartialRegUsesImpl::getAllocatableAndAlignedRegClassMask( unsigned AlignNumBits) const { const auto [I, Inserted] = AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits); @@ -214,7 +220,7 @@ const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask( } const TargetRegisterClass * -GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( +GCNRewritePartialRegUsesImpl::getRegClassWithShiftedSubregs( const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits, unsigned CoverSubregIdx, SubRegMap &SubRegs) const { @@ -289,8 +295,8 @@ GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( } const TargetRegisterClass * -GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC, - SubRegMap &SubRegs) const { +GCNRewritePartialRegUsesImpl::getMinSizeReg(const TargetRegisterClass *RC, + SubRegMap &SubRegs) const { unsigned CoverSubreg = AMDGPU::NoSubRegister; unsigned Offset = std::numeric_limits::max(); unsigned End = 0; @@ -343,9 +349,8 @@ GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC, // Only the subrange's lanemasks of the original interval need to be modified. // Subrange for a covering subreg becomes the main range. -void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg, - Register NewReg, - SubRegMap &SubRegs) const { +void GCNRewritePartialRegUsesImpl::updateLiveIntervals( + Register OldReg, Register NewReg, SubRegMap &SubRegs) const { if (!LIS->hasInterval(OldReg)) return; @@ -400,13 +405,13 @@ void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg, } const TargetRegisterClass * -GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const { +GCNRewritePartialRegUsesImpl::getOperandRegClass(MachineOperand &MO) const { MachineInstr *MI = MO.getParent(); return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI, *MI->getParent()->getParent()); } -bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { +bool GCNRewritePartialRegUsesImpl::rewriteReg(Register Reg) const { auto Range = MRI->reg_nodbg_operands(Reg); if (Range.empty() || any_of(Range, [](MachineOperand &MO) { return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. 
@@ -476,12 +481,10 @@ bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { return true; } -bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) { +bool GCNRewritePartialRegUsesImpl::run(MachineFunction &MF) { MRI = &MF.getRegInfo(); TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo()); TII = MF.getSubtarget().getInstrInfo(); - auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>(); - LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; bool Changed = false; for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) { Changed |= rewriteReg(Register::index2VirtReg(I)); @@ -489,11 +492,33 @@ bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) { return Changed; } -char GCNRewritePartialRegUses::ID; +bool GCNRewritePartialRegUsesLegacy::runOnMachineFunction(MachineFunction &MF) { + LiveIntervalsWrapperPass *LISWrapper = + getAnalysisIfAvailable<LiveIntervalsWrapperPass>(); + LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; + GCNRewritePartialRegUsesImpl Impl(LIS); + return Impl.run(MF); +} + +PreservedAnalyses +GCNRewritePartialRegUsesPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF); + if (!GCNRewritePartialRegUsesImpl(LIS).run(MF)) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + PA.preserve<LiveIntervalsAnalysis>(); + PA.preserve<SlotIndexesAnalysis>(); + return PA; +} + +char GCNRewritePartialRegUsesLegacy::ID; -char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID; +char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUsesLegacy::ID; -INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE, "Rewrite Partial Register Uses", false, false) -INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE, +INITIALIZE_PASS_END(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE, "Rewrite Partial Register Uses", false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.h b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.h new file mode 100644 index 0000000000000..b2c3190b5c6ba --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.h @@ -0,0 +1,23 @@ +//===- GCNRewritePartialRegUses.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREWRITEPARTIALREGUSES_H +#define LLVM_LIB_TARGET_AMDGPU_GCNREWRITEPARTIALREGUSES_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class GCNRewritePartialRegUsesPass + : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNREWRITEPARTIALREGUSES_H diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index ed9c48ff9c4de..d0043bcc920b6 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -308,7 +308,7 @@ bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) { // taken from SIInstrInfo::hasModifiersSet() bool AMDGPUCustomBehaviour::hasModifiersSet( - const std::unique_ptr &Inst, unsigned OpName) const { + const std::unique_ptr &Inst, AMDGPU::OpName OpName) const { int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName); if (Idx == -1) return false; diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index 3a231758887ba..85b9c188b5d1a 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -17,6 +17,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCA_AMDGPUCUSTOMBEHAVIOUR_H #define LLVM_LIB_TARGET_AMDGPU_MCA_AMDGPUCUSTOMBEHAVIOUR_H +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MCA/CustomBehaviour.h" #include "llvm/TargetParser/TargetParser.h" @@ -66,7 +67,7 @@ class AMDGPUCustomBehaviour : public CustomBehaviour { void generateWaitCntInfo(); /// Helper function used in generateWaitCntInfo() bool hasModifiersSet(const std::unique_ptr &Inst, - unsigned OpName) const; + AMDGPU::OpName OpName) const; /// Helper function used in generateWaitCntInfo() bool isGWS(uint16_t Opcode) const; /// Helper function used in generateWaitCntInfo() diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index c389f3a13d952..381841f142855 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1205,7 +1205,7 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, int NumOps = 0; int Ops[3]; - std::pair MOps[] = { + std::pair MOps[] = { {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src0}, {AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src1}, {AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::src2}}; @@ -1226,7 +1226,7 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) { NumOps = 0; int DefaultValue = Mod == SISrcMods::OP_SEL_1; - for (int OpName : + for (AMDGPU::OpName OpName : {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers}) { int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b9a424bb1d059..1391ef6dd09e5 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -340,14 
+340,13 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, uint64_t AMDGPUMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { using namespace AMDGPU::VOP3PEncoding; - using namespace AMDGPU::OpName; - if (AMDGPU::hasNamedOperand(Opcode, op_sel_hi)) { - if (AMDGPU::hasNamedOperand(Opcode, src2)) + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel_hi)) { + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src2)) return 0; - if (AMDGPU::hasNamedOperand(Opcode, src1)) + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1)) return OP_SEL_HI_2; - if (AMDGPU::hasNamedOperand(Opcode, src0)) + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0)) return OP_SEL_HI_1 | OP_SEL_HI_2; } return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 879dbe1b279b1..9c0b2da0fcb0a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -50,7 +50,6 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #include "AMDGPUGenRegisterInfo.inc" #define GET_INSTRINFO_ENUM -#define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_MC_HELPER_DECLS #include "AMDGPUGenInstrInfo.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index eccd77d6c00f0..059bab5838526 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -579,7 +579,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::COMPUTE_PGM_RSRC3_GFX10_GFX11_SHARED_VGPR_COUNT, ".amdhsa_shared_vgpr_count"); } + if (IVersion.Major == 11) { + PrintField(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE, + ".amdhsa_inst_pref_size"); + } if (IVersion.Major >= 12) { + PrintField(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT, + amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE, + ".amdhsa_inst_pref_size"); PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h index cf40e7eccb5d2..20f2cb826ac4b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h @@ -32,7 +32,6 @@ MCInstrInfo *createR600MCInstrInfo(); #include "R600GenRegisterInfo.inc" #define GET_INSTRINFO_ENUM -#define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM #define GET_INSTRINFO_MC_HELPER_DECLS #include "R600GenInstrInfo.inc" diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index ef2d049f91752..429ce0e0857ac 100644 --- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -31,7 +31,7 @@ class R600ExpandSpecialInstrsPass : public MachineFunctionPass { const R600InstrInfo *TII = nullptr; void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, - unsigned Op); + R600::OpName Op); public: static char ID; @@ -61,7 +61,8 @@ FunctionPass *llvm::createR600ExpandSpecialInstrsPass() { } void 
R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, - const MachineInstr *OldMI, unsigned Op) { + const MachineInstr *OldMI, + R600::OpName Op) { int OpIdx = TII->getOperandIdx(*OldMI, Op); if (OpIdx > -1) { uint64_t Val = OldMI->getOperand(OpIdx).getImm(); diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index f95649db2942e..1c4a992c87271 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -222,19 +222,18 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { } int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { - static const unsigned SrcSelTable[][2] = { - {R600::OpName::src0, R600::OpName::src0_sel}, - {R600::OpName::src1, R600::OpName::src1_sel}, - {R600::OpName::src2, R600::OpName::src2_sel}, - {R600::OpName::src0_X, R600::OpName::src0_sel_X}, - {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, - {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, - {R600::OpName::src0_W, R600::OpName::src0_sel_W}, - {R600::OpName::src1_X, R600::OpName::src1_sel_X}, - {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, - {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, - {R600::OpName::src1_W, R600::OpName::src1_sel_W} - }; + static const R600::OpName SrcSelTable[][2] = { + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W}}; for (const auto &Row : SrcSelTable) { if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { @@ -249,15 +248,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { SmallVector, 3> Result; if (MI.getOpcode() == R600::DOT_4) { - static const unsigned OpTable[8][2] = { - {R600::OpName::src0_X, R600::OpName::src0_sel_X}, - {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, - {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, - {R600::OpName::src0_W, R600::OpName::src0_sel_W}, - {R600::OpName::src1_X, R600::OpName::src1_sel_X}, - {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, - {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, - {R600::OpName::src1_W, R600::OpName::src1_sel_W}, + static const R600::OpName OpTable[8][2] = { + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; for (const auto &Op : OpTable) { @@ -273,10 +272,10 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { return Result; } - static const unsigned OpTable[3][2] = { - {R600::OpName::src0, R600::OpName::src0_sel}, - {R600::OpName::src1, R600::OpName::src1_sel}, - {R600::OpName::src2, R600::OpName::src2_sel}, + static const R600::OpName OpTable[3][2] = { + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, }; for (const auto &Op : 
OpTable) { @@ -1238,19 +1237,14 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB return MIB; } -#define OPERAND_CASE(Label) \ - case Label: { \ - static const unsigned Ops[] = \ - { \ - Label##_X, \ - Label##_Y, \ - Label##_Z, \ - Label##_W \ - }; \ - return Ops[Slot]; \ +#define OPERAND_CASE(Label) \ + case Label: { \ + static const R600::OpName Ops[] = {Label##_X, Label##_Y, Label##_Z, \ + Label##_W}; \ + return Ops[Slot]; \ } -static unsigned getSlotedOps(unsigned Op, unsigned Slot) { +static R600::OpName getSlotedOps(R600::OpName Op, unsigned Slot) { switch (Op) { OPERAND_CASE(R600::OpName::update_exec_mask) OPERAND_CASE(R600::OpName::update_pred) @@ -1292,21 +1286,21 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot))); MachineInstr *MIB = buildDefaultInstruction( MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); - static const unsigned Operands[14] = { - R600::OpName::update_exec_mask, - R600::OpName::update_pred, - R600::OpName::write, - R600::OpName::omod, - R600::OpName::dst_rel, - R600::OpName::clamp, - R600::OpName::src0_neg, - R600::OpName::src0_rel, - R600::OpName::src0_abs, - R600::OpName::src0_sel, - R600::OpName::src1_neg, - R600::OpName::src1_rel, - R600::OpName::src1_abs, - R600::OpName::src1_sel, + static const R600::OpName Operands[14] = { + R600::OpName::update_exec_mask, + R600::OpName::update_pred, + R600::OpName::write, + R600::OpName::omod, + R600::OpName::dst_rel, + R600::OpName::clamp, + R600::OpName::src0_neg, + R600::OpName::src0_rel, + R600::OpName::src0_abs, + R600::OpName::src0_sel, + R600::OpName::src1_neg, + R600::OpName::src1_rel, + R600::OpName::src1_abs, + R600::OpName::src1_sel, }; MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), @@ -1314,7 +1308,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel)) .setReg(MO.getReg()); - for (unsigned Operand : Operands) { + for (R600::OpName Operand : Operands) { MachineOperand &MO = MI->getOperand( getOperandIdx(MI->getOpcode(), getSlotedOps(Operand, Slot))); assert (MO.isImm()); @@ -1340,15 +1334,16 @@ MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg); } -int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, + R600::OpName Op) const { return getOperandIdx(MI.getOpcode(), Op); } -int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const { +int R600InstrInfo::getOperandIdx(unsigned Opcode, R600::OpName Op) const { return R600::getNamedOperandIdx(Opcode, Op); } -void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op, +void R600InstrInfo::setImmOperand(MachineInstr &MI, R600::OpName Op, int64_t Imm) const { int Idx = getOperandIdx(MI, Op); assert(Idx != -1 && "Operand not supported for this instruction."); @@ -1425,37 +1420,37 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, return FlagOp; } -void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand, +void R600InstrInfo::addFlag(MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const { unsigned TargetFlags = get(MI.getOpcode()).TSFlags; if (Flag == 0) { return; } if (HAS_NATIVE_OPERANDS(TargetFlags)) { - MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + MachineOperand &FlagOp = getFlagOp(MI, SrcIdx, Flag); if (Flag == MO_FLAG_NOT_LAST) 
{ - clearFlag(MI, Operand, MO_FLAG_LAST); + clearFlag(MI, SrcIdx, MO_FLAG_LAST); } else if (Flag == MO_FLAG_MASK) { - clearFlag(MI, Operand, Flag); + clearFlag(MI, SrcIdx, Flag); } else { FlagOp.setImm(1); } } else { - MachineOperand &FlagOp = getFlagOp(MI, Operand); - FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + MachineOperand &FlagOp = getFlagOp(MI, SrcIdx); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * SrcIdx))); } } -void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, +void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const { unsigned TargetFlags = get(MI.getOpcode()).TSFlags; if (HAS_NATIVE_OPERANDS(TargetFlags)) { - MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + MachineOperand &FlagOp = getFlagOp(MI, SrcIdx, Flag); FlagOp.setImm(0); } else { MachineOperand &FlagOp = getFlagOp(MI); unsigned InstFlags = FlagOp.getImm(); - InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * SrcIdx)); FlagOp.setImm(InstFlags); } } diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index c767ecb24590b..a403c65141fed 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_OPERAND_ENUM #include "R600GenInstrInfo.inc" namespace llvm { @@ -287,21 +288,21 @@ class R600InstrInfo final : public R600GenInstrInfo { /// Get the index of Op in the MachineInstr. /// /// \returns -1 if the Instruction does not contain the specified \p Op. - int getOperandIdx(const MachineInstr &MI, unsigned Op) const; + int getOperandIdx(const MachineInstr &MI, R600::OpName Op) const; /// Get the index of \p Op for the given Opcode. /// /// \returns -1 if the Instruction does not contain the specified \p Op. - int getOperandIdx(unsigned Opcode, unsigned Op) const; + int getOperandIdx(unsigned Opcode, R600::OpName Op) const; /// Helper function for setting instruction flag values. - void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const; + void setImmOperand(MachineInstr &MI, R600::OpName Op, int64_t Imm) const; - ///Add one of the MO_FLAG* flags to the specified \p Operand. - void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; + /// Add one of the MO_FLAG* flags to the operand at \p SrcIdx. + void addFlag(MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const; - ///Determine if the specified \p Flag is set on this \p Operand. - bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; + /// Determine if the specified \p Flag is set on operand at \p SrcIdx. + bool isFlagSet(const MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const; /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) /// \param Flag The flag being set. @@ -311,7 +312,7 @@ class R600InstrInfo final : public R600GenInstrInfo { unsigned Flag = 0) const; /// Clear the specified flag on the instruction. 
- void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; + void clearFlag(MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const; // Helper functions that check the opcode for status information bool isRegisterStore(const MachineInstr &MI) const { diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index 28bf6e33384d2..c1ed176ed29d2 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -122,12 +122,9 @@ class R600PacketizerList : public VLIWPacketizerList { void substitutePV(MachineInstr &MI, const DenseMap &PVs) const { - unsigned Ops[] = { - R600::OpName::src0, - R600::OpName::src1, - R600::OpName::src2 - }; - for (unsigned Op : Ops) { + const R600::OpName Ops[] = {R600::OpName::src0, R600::OpName::src1, + R600::OpName::src2}; + for (R600::OpName Op : Ops) { int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Op); if (OperandIdx < 0) continue; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2bc19137b1ca0..d8f3f9c54abc1 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -369,20 +369,20 @@ bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const { // Refer to op_sel/op_sel_hi and check if we can change the immediate and // op_sel in a way that allows an inline constant. - int ModIdx = -1; + AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES; unsigned SrcIdx = ~0; if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) { - ModIdx = AMDGPU::OpName::src0_modifiers; + ModName = AMDGPU::OpName::src0_modifiers; SrcIdx = 0; } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) { - ModIdx = AMDGPU::OpName::src1_modifiers; + ModName = AMDGPU::OpName::src1_modifiers; SrcIdx = 1; } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) { - ModIdx = AMDGPU::OpName::src2_modifiers; + ModName = AMDGPU::OpName::src2_modifiers; SrcIdx = 2; } - assert(ModIdx != -1); - ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); + assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES); + int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName); MachineOperand &Mod = MI->getOperand(ModIdx); unsigned ModVal = Mod.getImm(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b632c50dae0e3..28debbcfc1ede 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8619,6 +8619,11 @@ SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, if (MaxID == 0) return DAG.getConstant(0, SL, MVT::i32); + // It's undefined behavior if a function marked with the amdgpu-no-* + // attributes uses the corresponding intrinsic. 
+ if (!Arg) + return DAG.getUNDEF(Op->getValueType(0)); + SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), Arg); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 3d6419778f4b1..ee263f58bcaf2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1139,7 +1139,7 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() { return new SIInsertWaitcnts(); } -static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, +static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc) { int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); assert(OpIdx >= 0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 35667801c809d..baacb5d3d5455 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -79,7 +79,8 @@ static unsigned getNumOperandsNoGlue(SDNode *Node) { /// Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. -static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, + AMDGPU::OpName OpName) { unsigned Opc0 = N0->getMachineOpcode(); unsigned Opc1 = N1->getMachineOpcode(); @@ -2701,11 +2702,10 @@ SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { return std::nullopt; } -bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, - MachineOperand &Src0, - unsigned Src0OpName, +bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, + AMDGPU::OpName Src0OpName, MachineOperand &Src1, - unsigned Src1OpName) const { + AMDGPU::OpName Src1OpName) const { MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); if (!Src0Mods) return false; @@ -3432,14 +3432,14 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { } } -static constexpr unsigned ModifierOpNames[] = { +static constexpr AMDGPU::OpName ModifierOpNames[] = { AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, AMDGPU::OpName::omod, AMDGPU::OpName::op_sel}; void SIInstrInfo::removeModOperands(MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); - for (unsigned Name : reverse(ModifierOpNames)) { + for (AMDGPU::OpName Name : reverse(ModifierOpNames)) { int Idx = AMDGPU::getNamedOperandIdx(Opc, Name); if (Idx >= 0) MI.removeOperand(Idx); @@ -4494,14 +4494,14 @@ bool SIInstrInfo::hasModifiers(unsigned Opcode) const { } bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const { + AMDGPU::OpName OpName) const { const MachineOperand *Mods = getNamedOperand(MI, OpName); return Mods && Mods->getImm(); } bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { return any_of(ModifierOpNames, - [&](unsigned Name) { return hasModifiersSet(MI, Name); }); + [&](AMDGPU::OpName Name) { return hasModifiersSet(MI, Name); }); } bool SIInstrInfo::canShrink(const MachineInstr &MI, @@ -5268,7 +5268,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (DimOp) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vaddr0); - int RSrcOpName = + AMDGPU::OpName RSrcOpName = isMIMG(MI) ? 
AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName); const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); @@ -5373,9 +5373,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); - uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0 - : AMDGPU::OpName::vdata; - const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); + AMDGPU::OpName DataName = + isDS(Opcode) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata; + const MachineOperand *Data = getNamedOperand(MI, DataName); const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); if (Data && !Data->isReg()) Data = nullptr; @@ -5405,7 +5405,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (ST.needsAlignedVGPRs()) { - const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool { + const auto isAlignedReg = [&MI, &MRI, this](AMDGPU::OpName OpName) -> bool { const MachineOperand *Op = getNamedOperand(MI, OpName); if (!Op) return true; @@ -6890,13 +6890,15 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // conversion to the addr64 form. if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && (isMUBUF(MI) || isMTBUF(MI)))) { - int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc - : AMDGPU::OpName::srsrc; + AMDGPU::OpName RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) + ? AMDGPU::OpName::rsrc + : AMDGPU::OpName::srsrc; MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); - int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp; + AMDGPU::OpName SampOpName = + isMIMG(MI) ? 
AMDGPU::OpName::ssamp : AMDGPU::OpName::samp; MachineOperand *SSamp = getNamedOperand(MI, SampOpName); if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); @@ -8792,7 +8794,7 @@ Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, } MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, - unsigned OperandName) const { + AMDGPU::OpName OperandName) const { int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); if (Idx == -1) return nullptr; @@ -10109,7 +10111,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, } void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, - unsigned OpName) const { + AMDGPU::OpName OpName) const { if (!ST.needsAlignedVGPRs()) return; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 933935a86f9f9..6b0de138251ab 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -190,9 +190,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { std::optional isCopyInstrImpl(const MachineInstr &MI) const override; - bool swapSourceModifiers(MachineInstr &MI, - MachineOperand &Src0, unsigned Src0OpName, - MachineOperand &Src1, unsigned Src1OpName) const; + bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, + AMDGPU::OpName Src0OpName, MachineOperand &Src1, + AMDGPU::OpName Src1OpName) const; bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, const MachineOperand *fromMO, unsigned toIdx, const MachineOperand *toMO) const; @@ -1137,8 +1137,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// e.g. src[012]_mod, omod, clamp. bool hasModifiers(unsigned Opcode) const; - bool hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const; + bool hasModifiersSet(const MachineInstr &MI, AMDGPU::OpName OpName) const; bool hasAnyModifiersSet(const MachineInstr &MI) const; bool canShrink(const MachineInstr &MI, @@ -1294,17 +1293,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. LLVM_READONLY - MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + MachineOperand *getNamedOperand(MachineInstr &MI, + AMDGPU::OpName OperandName) const; LLVM_READONLY const MachineOperand *getNamedOperand(const MachineInstr &MI, - unsigned OpName) const { - return getNamedOperand(const_cast(MI), OpName); + AMDGPU::OpName OperandName) const { + return getNamedOperand(const_cast(MI), OperandName); } /// Get required immediate operand - int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const { - int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + int64_t getNamedImmOperand(const MachineInstr &MI, + AMDGPU::OpName OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); return MI.getOperand(Idx).getImm(); } @@ -1461,7 +1462,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // Enforce operand's \p OpName even alignment if required by target. // This is used if an operand is a 32 bit register but needs to be aligned // regardless. 
- void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; + void enforceOperandRCAlignment(MachineInstr &MI, AMDGPU::OpName OpName) const; }; /// \brief Returns true if a reg:subreg pair P has a TRC class diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 708acc9f88445..39359d24cff0c 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -228,11 +228,11 @@ class SILoadStoreOptimizer { CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, int OpName, - Register DestReg) const; + MachineBasicBlock::iterator InsertBefore, + AMDGPU::OpName OpName, Register DestReg) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - int OpName) const; + AMDGPU::OpName OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -699,7 +699,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { if (TII.isImage(Opc)) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); if (VAddr0Idx >= 0) { - int RsrcName = + AMDGPU::OpName RsrcName = TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); Result.NumVAddrs = RsrcIdx - VAddr0Idx; @@ -968,11 +968,11 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, return false; // Check other optional immediate operands for equality. - unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, - AMDGPU::OpName::unorm, AMDGPU::OpName::da, - AMDGPU::OpName::r128, AMDGPU::OpName::a16}; + AMDGPU::OpName OperandsToMatch[] = { + AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm, + AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16}; - for (auto op : OperandsToMatch) { + for (AMDGPU::OpName op : OperandsToMatch) { int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) return false; @@ -1256,7 +1256,7 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. 
void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, int OpName, + MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, Register DestReg) const { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1287,7 +1287,7 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - int OpName) const { + AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 3fb8d5b560496..920c3e11e4718 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -619,7 +619,7 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence( VCmp.getDebugLoc(), TII->get(NewOpcode)); auto TryAddImmediateValueFromNamedOperand = - [&](unsigned OperandName) -> void { + [&](AMDGPU::OpName OperandName) -> void { if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) Builder.addImm(Mod->getImm()); }; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 7c98ccddb5dd5..cd41b5e94902f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -41,49 +41,8 @@ foreach Size = {2...6,8,16} in { //===----------------------------------------------------------------------===// class getSubRegs { - list ret2 = [sub0, sub1]; - list ret3 = [sub0, sub1, sub2]; - list ret4 = [sub0, sub1, sub2, sub3]; - list ret5 = [sub0, sub1, sub2, sub3, sub4]; - list ret6 = [sub0, sub1, sub2, sub3, sub4, sub5]; - list ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6]; - list ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; - list ret9 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8]; - list ret10 = [sub0, sub1, sub2, sub3, - sub4, sub5, sub6, sub7, - sub8, sub9]; - list ret11 = [sub0, sub1, sub2, sub3, - sub4, sub5, sub6, sub7, - sub8, sub9, sub10]; - list ret12 = [sub0, sub1, sub2, sub3, - sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11]; - list ret16 = [sub0, sub1, sub2, sub3, - sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, - sub12, sub13, sub14, sub15]; - list ret32 = [sub0, sub1, sub2, sub3, - sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, - sub12, sub13, sub14, sub15, - sub16, sub17, sub18, sub19, - sub20, sub21, sub22, sub23, - sub24, sub25, sub26, sub27, - sub28, sub29, sub30, sub31]; - - list ret = !if(!eq(size, 2), ret2, - !if(!eq(size, 3), ret3, - !if(!eq(size, 4), ret4, - !if(!eq(size, 5), ret5, - !if(!eq(size, 6), ret6, - !if(!eq(size, 7), ret7, - !if(!eq(size, 8), ret8, - !if(!eq(size, 9), ret9, - !if(!eq(size, 10), ret10, - !if(!eq(size, 11), ret11, - !if(!eq(size, 12), ret12, - !if(!eq(size, 16), ret16, - ret32)))))))))))); + list ret = + !foreach(idx, !range(0, size), !cast(sub#idx)); } // Generates list of sequential register tuple names. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 67bebfb3418d5..e458b6b9604b6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -19,6 +19,10 @@ #include #include +// Pull in OpName enum definition and getNamedOperandIdx() declaration. 
+#define GET_INSTRINFO_OPERAND_ENUM +#include "AMDGPUGenInstrInfo.inc" + struct amd_kernel_code_t; namespace llvm { @@ -394,10 +398,7 @@ template struct EncodingFields { }; LLVM_READONLY -int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); - -LLVM_READONLY -inline bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx) { +inline bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx) { return getNamedOperandIdx(Opcode, NamedIdx) != -1; } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 1afd68767cd3b..5e825e7259a95 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -154,10 +154,12 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile { + Instruction mixhi_inst, + bit HasFP32Denormals> { // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? + let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]) in { def : GCNPat < (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), @@ -177,6 +179,45 @@ multiclass MadFmaMixPats; + def : GCNPat < + (AMDGPUclamp (build_vector + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), + (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + $hi_src1_modifiers, $hi_src1, + $hi_src2_modifiers, $hi_src2, + DSTCLAMP.ENABLE, + (mixlo_inst $lo_src0_modifiers, $lo_src0, + $lo_src1_modifiers, $lo_src1, + $lo_src2_modifiers, $lo_src2, + DSTCLAMP.ENABLE, + (i32 (IMPLICIT_DEF))))) + >; + + def : GCNPat < + (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (mixlo_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), + DSTCLAMP.NONE, + (i32 (IMPLICIT_DEF))) + >; + + def : GCNPat < + (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), + (v2f16 (mixhi_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), + DSTCLAMP.NONE, + VGPR_32:$elt0)) + >; + def : GCNPat < (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), @@ -187,10 +228,14 @@ multiclass MadFmaMixPats; + } // End OtherPredicates // FIXME: Special case handling for maxhi (especially for clamp) // because dealing with the write to high half of the register is // difficult. 
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in + let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = p in { + def : GCNPat < (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), @@ -215,44 +260,44 @@ multiclass MadFmaMixPats; - def : GCNPat < - (AMDGPUclamp (build_vector - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), - (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, - $hi_src1_modifiers, $hi_src1, - $hi_src2_modifiers, $hi_src2, - DSTCLAMP.ENABLE, - (mixlo_inst $lo_src0_modifiers, $lo_src0, - $lo_src1_modifiers, $lo_src1, - $lo_src2_modifiers, $lo_src2, - DSTCLAMP.ENABLE, - (i32 (IMPLICIT_DEF))))) - >; + } // end OtherPredicates + let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), - (mixlo_inst $src0_modifiers, $src0, + (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1), + (v2f16 (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + $src2_modifiers, $src2, DSTCLAMP.NONE, - (i32 (IMPLICIT_DEF))) + (REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16))) >; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), + (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), (v2f16 (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, - (i32 0), (i32 0), + $src2_modifiers, $src2, DSTCLAMP.NONE, - VGPR_32:$elt0)) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + >; + + def : GCNPat < + (build_vector + f16:$elt0, + (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), + (v2f16 (mixhi_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.ENABLE, + (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) >; + } // end OtherPredicates } class MinimumMaximumByMinimum3Maximum3VOP3P; def : MinimumMaximumByMinimum3Maximum3VOP3P; } -let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in { +let SubtargetPredicate = HasMadMixInsts in { +let OtherPredicates = [NoFP32Denormals] in { // These are VOP3a-like opcodes which accept no omod. // Size of src arguments (16/32) is controlled by op_sel. 
@@ -284,9 +330,10 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile; -} // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] +defm : MadFmaMixPats; +} // End SubtargetPredicate = HasMadMixInsts // Essentially the same as the mad_mix versions @@ -306,7 +353,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile; +defm : MadFmaMixPats; } // Defines patterns that extract signed 4bit from each Idx[0]. diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp index 66a76a8c7a95a..a206e993394f7 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -298,7 +298,8 @@ static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT, static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - // f16 arguments are extended to i32 and assigned to a register in [r0, r3] + // f16 and bf16 arguments are extended to i32 and assigned to a register in + // [r0, r3]. return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State, RRegList); } @@ -307,10 +308,25 @@ static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - // f16 arguments are extended to f32 and assigned to a register in [s0, s15] + // f16 and bf16 arguments are extended to f32 and assigned to a register in + // [s0, s15]. return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State, SRegList); } +static bool CC_ARM_AAPCS_Common_Custom_f16_Stack(unsigned ValNo, MVT ValVT, + MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + // f16 and bf16 (if not passed in a register) are assigned to a 32-bit stack + // slot, with the most-significant 16 bits unspecified. The 32-bit slot is + // important to make sure that the byte ordering is correct for big endian + // targets. + State.addLoc(CCValAssign::getCustomMem( + ValNo, ValVT, State.AllocateStack(4, Align(4)), MVT::i32, LocInfo)); + return true; +} + // Include the table generated calling convention implementations. 
#include "ARMGenCallingConv.inc" diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td index 27f175a700336..f1ab1c3103740 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/llvm/lib/Target/ARM/ARMCallingConv.td @@ -139,7 +139,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[ CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>, CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>, - CCIfType<[f16, bf16, f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, + CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, + CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Common_Custom_f16_Stack">>, CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>, CCIfType<[v2f64], CCIfAlign<"16", CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index bd8d6079e1ba8..2bac1d0086041 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4759,6 +4759,25 @@ SDValue ARMTargetLowering::LowerFormalArguments( VA.getLocMemOffset(), Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); + } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 || + VA.getValVT() == MVT::bf16)) { + // f16 and bf16 values are passed in the least-significant half of + // a 4 byte stack slot. This is done as-if the extension was done + // in a 32-bit register, so the actual bytes used for the value + // differ between little and big endian. + assert(VA.getLocVT().getSizeInBits() == 32); + unsigned FIOffset = VA.getLocMemOffset(); + int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8, + FIOffset, true); + + SDValue Addr = DAG.getFrameIndex(FI, PtrVT); + if (DAG.getDataLayout().isBigEndian()) + Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2)); + + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FI))); + } else { unsigned FIOffset = VA.getLocMemOffset(); int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, @@ -8479,7 +8498,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize >= 32 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + ShuffleVectorSDNode::isSplatMask(M) || ShuffleVectorInst::isIdentityMask(M, M.size()) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index 6a15bac153d85..453aacf49abb3 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -17,6 +17,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -96,7 +98,8 @@ void ModuleShaderFlags::updateFunctionFlags(ComputedShaderFlags &CSF, } /// Construct ModuleShaderFlags for module Module M -void ModuleShaderFlags::initialize(Module &M, DXILResourceTypeMap &DRTM) { +void ModuleShaderFlags::initialize(Module &M, DXILResourceTypeMap &DRTM, + const ModuleMetadataInfo &MMDI) { CallGraph CG(M); // Compute Shader Flags Mask for all functions using post-order visit of SCC @@ -142,6 +145,20 @@ void 
ModuleShaderFlags::initialize(Module &M, DXILResourceTypeMap &DRTM) { // Merge SCCSF with that of F FunctionFlags[F].merge(SCCSF); } + + // Set DisableOptimizations flag based on the presence of OptimizeNone + // attribute of entry functions. + if (MMDI.EntryPropertyVec.size() > 0) { + CombinedSFMask.DisableOptimizations = + MMDI.EntryPropertyVec[0].Entry->hasFnAttribute( + llvm::Attribute::OptimizeNone); + // Ensure all entry functions have the same optimization attribute + for (const auto &EntryFunProps : MMDI.EntryPropertyVec) + if (CombinedSFMask.DisableOptimizations != + EntryFunProps.Entry->hasFnAttribute(llvm::Attribute::OptimizeNone)) + EntryFunProps.Entry->getContext().diagnose(DiagnosticInfoUnsupported( + *(EntryFunProps.Entry), "Inconsistent optnone attribute ")); + } } void ComputedShaderFlags::print(raw_ostream &OS) const { @@ -180,9 +197,10 @@ AnalysisKey ShaderFlagsAnalysis::Key; ModuleShaderFlags ShaderFlagsAnalysis::run(Module &M, ModuleAnalysisManager &AM) { DXILResourceTypeMap &DRTM = AM.getResult<DXILResourceTypeAnalysis>(M); + const ModuleMetadataInfo MMDI = AM.getResult<DXILMetadataAnalysis>(M); ModuleShaderFlags MSFI; - MSFI.initialize(M, DRTM); + MSFI.initialize(M, DRTM, MMDI); return MSFI; } @@ -212,14 +230,17 @@ PreservedAnalyses ShaderFlagsAnalysisPrinter::run(Module &M, bool ShaderFlagsAnalysisWrapper::runOnModule(Module &M) { DXILResourceTypeMap &DRTM = getAnalysis<DXILResourceTypeWrapperPass>().getResourceTypeMap(); + const ModuleMetadataInfo MMDI = + getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); - MSFI.initialize(M, DRTM); + MSFI.initialize(M, DRTM, MMDI); return false; } void ShaderFlagsAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequiredTransitive<DXILResourceTypeWrapperPass>(); + AU.addRequired<DXILMetadataAnalysisWrapperPass>(); } char ShaderFlagsAnalysisWrapper::ID = 0; @@ -227,5 +248,6 @@ char ShaderFlagsAnalysisWrapper::ID = 0; INITIALIZE_PASS_BEGIN(ShaderFlagsAnalysisWrapper, "dx-shader-flag-analysis", "DXIL Shader Flag Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_END(ShaderFlagsAnalysisWrapper, "dx-shader-flag-analysis", "DXIL Shader Flag Analysis", true, true) diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h index e6c6d56402c1a..abf7cc86259ed 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.h +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h @@ -14,6 +14,7 @@ #ifndef LLVM_TARGET_DIRECTX_DXILSHADERFLAGS_H #define LLVM_TARGET_DIRECTX_DXILSHADERFLAGS_H +#include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -83,7 +84,8 @@ struct ComputedShaderFlags { }; struct ModuleShaderFlags { - void initialize(Module &, DXILResourceTypeMap &DRTM); + void initialize(Module &, DXILResourceTypeMap &DRTM, + const ModuleMetadataInfo &MMDI); const ComputedShaderFlags &getFunctionFlags(const Function *) const; const ComputedShaderFlags &getCombinedFlags() const { return CombinedSFMask; } diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 4a80ca81c672b..98b711f6b014b 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -172,7 +172,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonDisassembler() { DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, -
raw_ostream &CS) const { + CommentStream = &CS; + DecodeStatus Result = DecodeStatus::Success; bool Complete = false; Size = 0; @@ -184,7 +186,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Bytes.size() < HEXAGON_INSTR_SIZE) return MCDisassembler::Fail; MCInst *Inst = getContext().createMCInst(); - Result = getSingleInstruction(*Inst, MI, Bytes, Address, cs, Complete); + Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete); MI.addOperand(MCOperand::createInst(Inst)); Size += HEXAGON_INSTR_SIZE; Bytes = Bytes.slice(HEXAGON_INSTR_SIZE); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp index 8ebead8a41c90..4a9006935917c 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp @@ -9,13 +9,13 @@ // number of instructions, set the prefLoopAlignment to 32 bytes (5). //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-loop-align" - #include "HexagonTargetMachine.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/Support/Debug.h" +#define DEBUG_TYPE "hexagon-loop-align" + using namespace llvm; static cl::opt diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp index e9fecef4ac5b9..2720e1d9a6a64 100644 --- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp +++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp @@ -20,8 +20,11 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" +#define DEBUG_TYPE "lanai-disassembler" + using namespace llvm; typedef MCDisassembler::DecodeStatus DecodeStatus; diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h index 8aa28417896dc..04aeb3f27a3da 100644 --- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h +++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h @@ -15,8 +15,6 @@ #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#define DEBUG_TYPE "lanai-disassembler" - namespace llvm { class LanaiDisassembler : public MCDisassembler { diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp index c03ef8d33220c..ae5922cba4ce3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp @@ -70,18 +70,13 @@ static void addKernelMetadata(Module &M, Function *F) { llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; - llvm::Metadata *BlockMDVals[] = { - llvm::ConstantAsMetadata::get(F), - llvm::MDString::get(Ctx, "maxclusterrank"), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; + F->addFnAttr("nvvm.maxclusterrank", "1"); + F->setCallingConv(CallingConv::PTX_Kernel); // Append metadata to nvvm.annotations. 
- F->setCallingConv(CallingConv::PTX_Kernel); MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals)); MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals)); MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals)); - MD->addOperand(llvm::MDNode::get(Ctx, BlockMDVals)); } static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 58ad92a8934a6..d18f25cb6fd48 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -989,6 +989,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand); } + setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); + // No FPOW or FREM in PTX. // Now deduce the information based on the above mentioned @@ -2642,6 +2644,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return SDValue(); case ISD::FRAMEADDR: return SDValue(); + case ISD::ADDRSPACECAST: + return LowerADDRSPACECAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::INTRINSIC_W_CHAIN: @@ -2757,6 +2761,17 @@ unsigned NVPTXTargetLowering::getJumpTableEncoding() const { return MachineJumpTableInfo::EK_Inline; } +SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op, + SelectionDAG &DAG) const { + AddrSpaceCastSDNode *N = cast(Op.getNode()); + unsigned SrcAS = N->getSrcAddressSpace(); + unsigned DestAS = N->getDestAddressSpace(); + if (SrcAS != llvm::ADDRESS_SPACE_GENERIC && + DestAS != llvm::ADDRESS_SPACE_GENERIC) + return DAG.getUNDEF(Op.getValueType()); + return Op; +} + // This function is almost a copy of SelectionDAG::expandVAArg(). // The only diff is that this one produces loads from local address space. SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 5adf69d621552..74ec14ba5f8e3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -264,6 +264,7 @@ class NVPTXTargetLowering : public TargetLowering { const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; + SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index a41943880807c..430502d85dfb4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -179,6 +179,13 @@ static bool argHasNVVMAnnotation(const Value &Val, return false; } +static std::optional getFnAttrParsedInt(const Function &F, + StringRef Attr) { + return F.hasFnAttribute(Attr) + ? 
std::optional(F.getFnAttributeAsParsedInteger(Attr)) + : std::nullopt; +} + bool isParamGridConstant(const Value &V) { if (const Argument *Arg = dyn_cast(&V)) { // "grid_constant" counts argument indices starting from 1 @@ -277,7 +284,7 @@ std::optional getClusterDimz(const Function &F) { } std::optional getMaxClusterRank(const Function &F) { - return findOneNVVMAnnotation(&F, "maxclusterrank"); + return getFnAttrParsedInt(F, "nvvm.maxclusterrank"); } std::optional getReqNTIDx(const Function &F) { @@ -303,11 +310,11 @@ std::optional getReqNTID(const Function &F) { } std::optional getMinCTASm(const Function &F) { - return findOneNVVMAnnotation(&F, "minctasm"); + return getFnAttrParsedInt(F, "nvvm.minctasm"); } std::optional getMaxNReg(const Function &F) { - return findOneNVVMAnnotation(&F, "maxnreg"); + return getFnAttrParsedInt(F, "nvvm.maxnreg"); } MaybeAlign getAlign(const Function &F, unsigned Index) { diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 98d3615ebab58..9b23a5ab521c8 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -63,6 +63,7 @@ add_llvm_target(RISCVCodeGen RISCVVectorMaskDAGMutation.cpp RISCVVectorPeephole.cpp RISCVVLOptimizer.cpp + RISCVVMV0Elimination.cpp RISCVZacasABIFix.cpp GISel/RISCVCallLowering.cpp GISel/RISCVInstructionSelector.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index b1aee98739e85..851eea1352852 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -107,6 +107,9 @@ void initializeRISCVPreLegalizerCombinerPass(PassRegistry &); FunctionPass *createRISCVVLOptimizerPass(); void initializeRISCVVLOptimizerPass(PassRegistry &); + +FunctionPass *createRISCVVMV0EliminationPass(); +void initializeRISCVVMV0EliminationPass(PassRegistry &); } // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 87c07c3cd505f..1e7ce136dc327 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -46,6 +46,7 @@ include "RISCVMacroFusion.td" // RISC-V Scheduling Models //===----------------------------------------------------------------------===// +include "RISCVSchedGeneric.td" include "RISCVSchedMIPSP8700.td" include "RISCVSchedRocket.td" include "RISCVSchedSiFive7.td" diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index ec2e8f1d50264..fb2c5c62ef871 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -254,7 +254,6 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( bool IsMasked, bool IsStridedOrIndexed, SmallVectorImpl &Operands, bool IsLoad, MVT *IndexVT) { SDValue Chain = Node->getOperand(0); - SDValue Glue; Operands.push_back(Node->getOperand(CurOp++)); // Base pointer. @@ -265,11 +264,8 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( } if (IsMasked) { - // Mask needs to be copied to V0. SDValue Mask = Node->getOperand(CurOp++); - Chain = CurDAG->getCopyToReg(Chain, DL, RISCV::V0, Mask, SDValue()); - Glue = Chain.getValue(1); - Operands.push_back(CurDAG->getRegister(RISCV::V0, Mask.getValueType())); + Operands.push_back(Mask); } SDValue VL; selectVLOp(Node->getOperand(CurOp++), VL); @@ -291,8 +287,6 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( } Operands.push_back(Chain); // Chain. 
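The NVPTX changes above move launch-bound style annotations such as maxclusterrank, minctasm and maxnreg from !nvvm.annotations metadata onto string-valued function attributes ("nvvm.maxclusterrank", "nvvm.minctasm", "nvvm.maxnreg"), read back through the new getFnAttrParsedInt helper. A minimal standalone sketch of that read path, assuming only llvm/IR/Function.h and using the made-up helper name readNVVMIntAttr (not the in-tree helper):

#include <cstdint>
#include <optional>
#include "llvm/IR/Function.h"

// Illustrative sketch only: return the parsed integer value of a string
// function attribute such as "nvvm.maxclusterrank", or std::nullopt if the
// function does not carry that attribute.
static std::optional<uint64_t> readNVVMIntAttr(const llvm::Function &F,
                                               llvm::StringRef Name) {
  if (!F.hasFnAttribute(Name))
    return std::nullopt;
  // Function::getFnAttributeAsParsedInteger parses the attribute's string
  // value ("1" in the ctor/dtor lowering change above) into an integer.
  return F.getFnAttributeAsParsedInteger(Name);
}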
- if (Glue) - Operands.push_back(Glue); } void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, unsigned NF, bool IsMasked, @@ -1844,19 +1838,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } - // Mask needs to be copied to V0. - SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, - RISCV::V0, Mask, SDValue()); - SDValue Glue = Chain.getValue(1); - SDValue V0 = CurDAG->getRegister(RISCV::V0, VT); - if (IsCmpConstant) { SDValue Imm = selectImm(CurDAG, SDLoc(Src2), XLenVT, CVal - 1, *Subtarget); ReplaceNode(Node, CurDAG->getMachineNode( VMSGTMaskOpcode, DL, VT, - {MaskedOff, Src1, Imm, V0, VL, SEW, Glue})); + {MaskedOff, Src1, Imm, Mask, VL, SEW})); return; } @@ -1867,7 +1855,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // the agnostic result can be either undisturbed or all 1. SDValue Cmp = SDValue( CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT, - {MaskedOff, Src1, Src2, V0, VL, SEW, Glue}), + {MaskedOff, Src1, Src2, Mask, VL, SEW}), 0); // vmxor.mm vd, vd, v0 is used to update active value. ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT, @@ -3287,12 +3275,10 @@ static bool vectorPseudoHasAllNBitUsers(SDNode *User, unsigned UserOpNo, return false; assert(RISCVII::hasVLOp(TSFlags)); - bool HasGlueOp = User->getGluedNode() != nullptr; - unsigned ChainOpIdx = User->getNumOperands() - HasGlueOp - 1; + unsigned ChainOpIdx = User->getNumOperands() - 1; bool HasChainOp = User->getOperand(ChainOpIdx).getValueType() == MVT::Other; bool HasVecPolicyOp = RISCVII::hasVecPolicyOp(TSFlags); - unsigned VLIdx = - User->getNumOperands() - HasVecPolicyOp - HasChainOp - HasGlueOp - 2; + unsigned VLIdx = User->getNumOperands() - HasVecPolicyOp - HasChainOp - 2; const unsigned Log2SEW = User->getConstantOperandVal(VLIdx + 1); if (UserOpNo == VLIdx) @@ -3759,43 +3745,7 @@ bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) { return false; } -// After ISel, a vector pseudo's mask will be copied to V0 via a CopyToReg -// that's glued to the pseudo. This tries to look up the value that was copied -// to V0. -static SDValue getMaskSetter(SDValue MaskOp, SDValue GlueOp) { - // Check that we're using V0 as a mask register. - if (!isa(MaskOp) || - cast(MaskOp)->getReg() != RISCV::V0) - return SDValue(); - - // The glued user defines V0. - const auto *Glued = GlueOp.getNode(); - - if (!Glued || Glued->getOpcode() != ISD::CopyToReg) - return SDValue(); - - // Check that we're defining V0 as a mask register. - if (!isa(Glued->getOperand(1)) || - cast(Glued->getOperand(1))->getReg() != RISCV::V0) - return SDValue(); - - SDValue MaskSetter = Glued->getOperand(2); - - // Sometimes the VMSET is wrapped in a COPY_TO_REGCLASS, e.g. if the mask came - // from an extract_subvector or insert_subvector. - if (MaskSetter->isMachineOpcode() && - MaskSetter->getMachineOpcode() == RISCV::COPY_TO_REGCLASS) - MaskSetter = MaskSetter->getOperand(0); - - return MaskSetter; -} - -static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) { - // Check the instruction defining V0; it needs to be a VMSET pseudo. - SDValue MaskSetter = getMaskSetter(MaskOp, GlueOp); - if (!MaskSetter) - return false; - +static bool usesAllOnesMask(SDValue MaskOp) { const auto IsVMSet = [](unsigned Opc) { return Opc == RISCV::PseudoVMSET_M_B1 || Opc == RISCV::PseudoVMSET_M_B16 || Opc == RISCV::PseudoVMSET_M_B2 || Opc == RISCV::PseudoVMSET_M_B32 || @@ -3806,14 +3756,7 @@ static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) { // TODO: Check that the VMSET is the expected bitwidth? 
The pseudo has // undefined behaviour if it's the wrong bitwidth, so we could choose to // assume that it's all-ones? Same applies to its VL. - return MaskSetter->isMachineOpcode() && - IsVMSet(MaskSetter.getMachineOpcode()); -} - -// Return true if we can make sure mask of N is all-ones mask. -static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) { - return usesAllOnesMask(N->getOperand(MaskOpIdx), - N->getOperand(N->getNumOperands() - 1)); + return MaskOp->isMachineOpcode() && IsVMSet(MaskOp.getMachineOpcode()); } static bool isImplicitDef(SDValue V) { @@ -3829,9 +3772,7 @@ static bool isImplicitDef(SDValue V) { } // Optimize masked RVV pseudo instructions with a known all-ones mask to their -// corresponding "unmasked" pseudo versions. The mask we're interested in will -// take the form of a V0 physical register operand, with a glued -// register-setting instruction. +// corresponding "unmasked" pseudo versions. bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { const RISCV::RISCVMaskedPseudoInfo *I = RISCV::getMaskedPseudoInfo(N->getMachineOpcode()); @@ -3839,7 +3780,7 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { return false; unsigned MaskOpIdx = I->MaskOpIdx; - if (!usesAllOnesMask(N, MaskOpIdx)) + if (!usesAllOnesMask(N->getOperand(MaskOpIdx))) return false; // There are two classes of pseudos in the table - compares and @@ -3863,18 +3804,13 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { // Skip the passthru operand at index 0 if the unmasked don't have one. bool ShouldSkip = !HasPassthru && MaskedHasPassthru; for (unsigned I = ShouldSkip, E = N->getNumOperands(); I != E; I++) { - // Skip the mask, and the Glue. + // Skip the mask SDValue Op = N->getOperand(I); - if (I == MaskOpIdx || Op.getValueType() == MVT::Glue) + if (I == MaskOpIdx) continue; Ops.push_back(Op); } - // Transitively apply any node glued to our new node. - const auto *Glued = N->getGluedNode(); - if (auto *TGlued = Glued->getGluedNode()) - Ops.push_back(SDValue(TGlued, TGlued->getNumValues() - 1)); - MachineSDNode *Result = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); @@ -3910,17 +3846,13 @@ static bool IsVMerge(SDNode *N) { // The resulting policy is the effective policy the vmerge would have had, // i.e. whether or not it's passthru operand was implicit-def. bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { - SDValue Passthru, False, True, VL, Mask, Glue; + SDValue Passthru, False, True, VL, Mask; assert(IsVMerge(N)); Passthru = N->getOperand(0); False = N->getOperand(1); True = N->getOperand(2); Mask = N->getOperand(3); VL = N->getOperand(4); - // We always have a glue node for the mask at v0. - Glue = N->getOperand(N->getNumOperands() - 1); - assert(cast(Mask)->getReg() == RISCV::V0); - assert(Glue.getValueType() == MVT::Glue); // If the EEW of True is different from vmerge's SEW, then we can't fold. if (True.getSimpleValueType() != N->getSimpleValueType(0)) @@ -3963,12 +3895,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { if (TII->get(TrueOpc).hasUnmodeledSideEffects()) return false; - // The last operand of a masked instruction may be glued. - bool HasGlueOp = True->getGluedNode() != nullptr; - - // The chain operand may exist either before the glued operands or in the last - // position. 
- unsigned TrueChainOpIdx = True.getNumOperands() - HasGlueOp - 1; + unsigned TrueChainOpIdx = True.getNumOperands() - 1; bool HasChainOp = True.getOperand(TrueChainOpIdx).getValueType() == MVT::Other; @@ -3980,7 +3907,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { LoopWorklist.push_back(False.getNode()); LoopWorklist.push_back(Mask.getNode()); LoopWorklist.push_back(VL.getNode()); - LoopWorklist.push_back(Glue.getNode()); if (SDNode::hasPredecessorHelper(True.getNode(), Visited, LoopWorklist)) return false; } @@ -3988,7 +3914,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { // The vector policy operand may be present for masked intrinsics bool HasVecPolicyOp = RISCVII::hasVecPolicyOp(TrueTSFlags); unsigned TrueVLIndex = - True.getNumOperands() - HasVecPolicyOp - HasChainOp - HasGlueOp - 2; + True.getNumOperands() - HasVecPolicyOp - HasChainOp - 2; SDValue TrueVL = True.getOperand(TrueVLIndex); SDValue SEW = True.getOperand(TrueVLIndex + 1); @@ -4020,7 +3946,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { if (RISCVII::elementsDependOnVL(TrueBaseMCID.TSFlags) && (TrueVL != VL)) return false; if (RISCVII::elementsDependOnMask(TrueBaseMCID.TSFlags) && - (Mask && !usesAllOnesMask(Mask, Glue))) + (Mask && !usesAllOnesMask(Mask))) return false; // Make sure it doesn't raise any observable fp exceptions, since changing the @@ -4077,9 +4003,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { if (HasChainOp) Ops.push_back(True.getOperand(TrueChainOpIdx)); - // Add the glue for the CopyToReg of mask->v0. - Ops.push_back(Glue); - MachineSDNode *Result = CurDAG->getMachineNode(MaskedOpc, DL, True->getVTList(), Ops); Result->setFlags(True->getFlags()); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 13ce566f8def6..829eef2e4d9d9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5338,18 +5338,28 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN, /// Is this mask local (i.e. elements only move within their local span), and /// repeating (that is, the same rearrangement is being done within each span)? static bool isLocalRepeatingShuffle(ArrayRef Mask, int Span) { - // TODO: Could improve the case where undef elements exist in the first span. + SmallVector LowSpan(Span, -1); for (auto [I, M] : enumerate(Mask)) { if (M == -1) continue; - int ChunkLo = I - (I % Span); - int ChunkHi = ChunkLo + Span; - if (M < ChunkLo || M >= ChunkHi || M - ChunkLo != Mask[I % Span]) + if ((M / Span) != (int)(I / Span)) + return false; + int SpanIdx = I % Span; + int Expected = M % Span; + if (LowSpan[SpanIdx] == -1) + LowSpan[SpanIdx] = Expected; + if (LowSpan[SpanIdx] != Expected) return false; } return true; } +/// Is this mask only using elements from the first span of the input? +static bool isLowSourceShuffle(ArrayRef Mask, int Span) { + return all_of(Mask, + [&](const auto &Idx) { return Idx == -1 || Idx < Span; }); +} + /// Try to widen element type to get a new mask value for a better permutation /// sequence. This doesn't try to inspect the widened mask for profitability; /// we speculate the widened form is equal or better. 
This has the effect of @@ -5739,12 +5749,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget); SDValue Gather; - // If we have a locally repeating mask, then we can reuse the first register - // in the index register group for all registers within the source register - // group. TODO: This generalizes to m2, and m4. - const MVT M1VT = getLMUL1VT(ContainerVT); - auto VLMAX = RISCVTargetLowering::computeVLMAXBounds(M1VT, Subtarget).first; - if (ContainerVT.bitsGT(M1VT) && isLocalRepeatingShuffle(Mask, VLMAX)) { + if (NumElts > MinVLMAX && isLocalRepeatingShuffle(Mask, MinVLMAX)) { + // If we have a locally repeating mask, then we can reuse the first + // register in the index register group for all registers within the + // source register group. TODO: This generalizes to m2, and m4. + const MVT M1VT = getLMUL1VT(ContainerVT); EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType()); SDValue SubIndex = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices, @@ -5766,6 +5775,41 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather, SubVec, SubIdx); } + } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX)) { + // If we have a shuffle which only uses the first register in our + // source register group, we can do a linear number of m1 vrgathers + // reusing the same source register (but with different indices) + // TODO: This can be generalized for m2 or m4, or for any shuffle + // for which we can do a vslidedown followed by this expansion. + const MVT M1VT = getLMUL1VT(ContainerVT); + EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType()); + auto [InnerTrueMask, InnerVL] = + getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget); + int N = ContainerVT.getVectorMinNumElements() / + M1VT.getVectorMinNumElements(); + assert(isPowerOf2_32(N) && N <= 8); + Gather = DAG.getUNDEF(ContainerVT); + SDValue SlideAmt = + DAG.getElementCount(DL, XLenVT, M1VT.getVectorElementCount()); + SDValue SubV1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1, + DAG.getVectorIdxConstant(0, DL)); + for (int i = 0; i < N; i++) { + if (i != 0) + LHSIndices = getVSlidedown(DAG, Subtarget, DL, IndexContainerVT, + DAG.getUNDEF(IndexContainerVT), LHSIndices, + SlideAmt, TrueMask, VL); + SDValue SubIndex = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices, + DAG.getVectorIdxConstant(0, DL)); + SDValue SubVec = + DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex, + DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL); + SDValue SubIdx = + DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL); + Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather, + SubVec, SubIdx); + } } else { Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, DAG.getUNDEF(ContainerVT), TrueMask, VL); @@ -5790,8 +5834,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // as a vselect + a single source vrgather.vv. Don't do this if we think the // operands may end up being lowered to something cheaper than a vrgather.vv. 
if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) && - !ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) && - !ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT) && + !ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS) && + !ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS) && !ShuffleVectorInst::isIdentityMask(ShuffleMaskLHS, NumElts) && !ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts)) if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget)) @@ -5834,7 +5878,7 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { return false; // Support splats for any type. These should type legalize well. - if (ShuffleVectorSDNode::isSplatMask(M.data(), VT)) + if (ShuffleVectorSDNode::isSplatMask(M)) return true; const unsigned NumElts = M.size(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 1c81719c767ec..ec628620d2982 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -330,9 +330,6 @@ isRVVSpillForZvlsseg(unsigned Opcode); bool isFaultFirstLoad(const MachineInstr &MI); -// Implemented in RISCVGenInstrInfo.inc -int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); - // Return true if both input instructions have equal rounding mode. If at least // one of the instructions does not have rounding mode, false will be returned. bool hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index fe85d4b074c87..77f41e3c202c7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -3935,7 +3935,7 @@ class VPatUnaryMask(intrinsic_name#"_mask") (result_type result_reg_class:$passthru), (op2_type op2_reg_class:$rs2), - (mask_type V0), + (mask_type VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast( !if(isSEWAware, @@ -3943,7 +3943,7 @@ class VPatUnaryMask; + (mask_type VMV0:$vm), GPR:$vl, log2sew, (XLenVT timm:$policy))>; class VPatUnaryMaskRoundingMode(intrinsic_name#"_mask") (result_type result_reg_class:$passthru), (op2_type op2_reg_class:$rs2), - (mask_type V0), + (mask_type VMV0:$vm), (XLenVT timm:$round), VLOpFrag, (XLenVT timm:$policy))), (!cast( @@ -3968,7 +3968,7 @@ class VPatUnaryMaskRoundingMode; @@ -3986,7 +3986,7 @@ class VPatUnaryMaskRTZ(intrinsic_name#"_mask") (result_type result_reg_class:$passthru), (op2_type op2_reg_class:$rs2), - (mask_type V0), + (mask_type VMV0:$vm), (XLenVT 0b001), VLOpFrag, (XLenVT timm:$policy))), (!cast( @@ -3995,7 +3995,7 @@ class VPatUnaryMaskRTZ; class VPatMaskUnaryNoMask(intrinsic_name#"_mask") (mti.Mask VR:$passthru), (mti.Mask VR:$rs2), - (mti.Mask V0), + (mti.Mask VMV0:$vm), VLOpFrag)), (!cast(inst#"_M_"#mti.BX#"_MASK") (mti.Mask VR:$passthru), (mti.Mask VR:$rs2), - (mti.Mask V0), GPR:$vl, mti.Log2SEW, TU_MU)>; + (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TU_MU)>; class VPatUnaryAnyMask(inst#"_MASK") (result_type result_reg_class:$passthru), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), GPR:$vl, sew)>; + (mask_type VMV0:$vm), GPR:$vl, sew)>; class VPatBinaryMaskPolicy(inst#"_MASK") (result_type result_reg_class:$passthru), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>; + (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>; class VPatBinaryMaskPolicyRoundingMode(inst#"_MASK") (result_type 
result_reg_class:$passthru), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), + (mask_type VMV0:$vm), (XLenVT timm:$round), GPR:$vl, sew, (XLenVT timm:$policy))>; @@ -4204,13 +4204,13 @@ class VPatBinaryMaskSwapped(inst#"_MASK") (result_type result_reg_class:$passthru), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), GPR:$vl, sew)>; + (mask_type VMV0:$vm), GPR:$vl, sew)>; class VPatTiedBinaryNoMask(inst#"_MASK_TIED") (result_type result_reg_class:$passthru), (op2_type op2_kind:$rs2), - (mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>; + (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>; class VPatTiedBinaryMaskRoundingMode(inst#"_MASK_TIED") (result_type result_reg_class:$passthru), (op2_type op2_kind:$rs2), - (mask_type V0), + (mask_type VMV0:$vm), (XLenVT timm:$round), GPR:$vl, sew, (XLenVT timm:$policy))>; @@ -4437,13 +4437,13 @@ class VPatTernaryMaskPolicy(inst#"_"#kind#"_"#vlmul.MX # "_MASK") result_reg_class:$rs3, (op1_type op1_reg_class:$rs1), op2_kind:$rs2, - (mask_type V0), + (mask_type VMV0:$vm), GPR:$vl, sew, (XLenVT timm:$policy))>; class VPatTernaryMaskPolicyRoundingMode(!if(isSEWAware, @@ -4472,7 +4472,7 @@ class VPatTernaryMaskPolicyRoundingMode; @@ -4492,13 +4492,13 @@ class VPatTernaryMaskTU(inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)# "_MASK") result_reg_class:$rs3, (op1_type op1_reg_class:$rs1), op2_kind:$rs2, - (mask_type V0), + (mask_type VMV0:$vm), GPR:$vl, log2sew, TU_MU)>; class VPatTernaryMaskTURoundingMode(inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)# "_MASK") result_reg_class:$rs3, (op1_type op1_reg_class:$rs1), op2_kind:$rs2, - (mask_type V0), + (mask_type VMV0:$vm), (XLenVT timm:$round), GPR:$vl, log2sew, TU_MU)>; @@ -4536,9 +4536,9 @@ multiclass VPatUnaryS_M(inst#"_M_"#mti.BX) $rs1, GPR:$vl, mti.Log2SEW)>; def : Pat<(XLenVT (!cast(intrinsic_name # "_mask") - (mti.Mask VR:$rs1), (mti.Mask V0), VLOpFrag)), + (mti.Mask VR:$rs1), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast(inst#"_M_"#mti.BX#"_MASK") $rs1, - (mti.Mask V0), GPR:$vl, mti.Log2SEW)>; + (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>; } } @@ -4626,9 +4626,9 @@ multiclass VPatNullaryV { vti.RegClass:$passthru, GPR:$vl, vti.Log2SEW, TU_MU)>; def : Pat<(vti.Vector (!cast(intrinsic # "_mask") (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag, (XLenVT timm:$policy))), + (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction#"_V_" # vti.LMul.MX # "_MASK") - vti.RegClass:$passthru, (vti.Mask V0), + vti.RegClass:$passthru, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; } } @@ -4726,13 +4726,13 @@ multiclass VPatBinaryCarryInTAIL(inst#"_"#kind#"_"#vlmul.MX) (result_type result_reg_class:$passthru), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), GPR:$vl, sew)>; + (mask_type VMV0:$vm), GPR:$vl, sew)>; } multiclass VPatBinaryCarryIn(intrinsic) (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), + (mask_type VMV0:$vm), VLOpFrag)), (!cast(inst#"_"#kind#"_"#vlmul.MX) (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), - (mask_type V0), GPR:$vl, sew)>; + (mask_type VMV0:$vm), GPR:$vl, sew)>; } multiclass VPatBinaryMaskOut; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; } } @@ -6164,14 +6164,14 @@ foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$passthru), (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$rs1), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT 
timm:$policy))), (!cast("PseudoVSUB_VV_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; @@ -6190,14 +6190,14 @@ foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$passthru), (vti.Vector vti.RegClass:$rs1), (vti.Scalar simm5_plus1:$rs2), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast("PseudoVADD_VI_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, (NegImm simm5_plus1:$rs2), - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; @@ -6834,14 +6834,14 @@ foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$passthru), (vti.Vector vti.RegClass:$rs1), (XLenVT 1), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast("PseudoVADD_VV_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1, - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; @@ -7184,9 +7184,9 @@ foreach fvti = AllFloatVectors in { def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru), (fvti.Vector fvti.RegClass:$rs2), (fvti.Scalar (fpimm0)), - (fvti.Mask V0), VLOpFrag)), + (fvti.Mask VMV0:$vm), VLOpFrag)), (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index c588e047c2ac8..2bd61883760e5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -928,16 +928,16 @@ foreach vtiToWti = AllWidenableIntVectors in { (!cast("PseudoVWADDU_VV_"#vti.LMul.MX) (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(shl (wti.Vector (riscv_sext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask V0), VLOpFrag)), + def : Pat<(shl (wti.Vector (riscv_sext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), 1, (XLenVT srcvalue)))), (!cast("PseudoVWADD_VV_"#vti.LMul.MX#"_MASK") (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(shl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(shl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs1), (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), 1, (XLenVT srcvalue)))), (!cast("PseudoVWADDU_VV_"#vti.LMul.MX#"_MASK") (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -1081,24 +1081,24 @@ defm : VPatWidenMulAddSDNode_VX; // 11.15. 
Vector Integer Merge Instructions foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (vselect (vti.Mask V0), vti.RegClass:$rs1, + def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), vti.RegClass:$rs1, vti.RegClass:$rs2)), (!cast("PseudoVMERGE_VVM_"#vti.LMul.MX) (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), + vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask VMV0:$vm), vti.AVL, vti.Log2SEW)>; - def : Pat<(vti.Vector (vselect (vti.Mask V0), (SplatPat XLenVT:$rs1), + def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat XLenVT:$rs1), vti.RegClass:$rs2)), (!cast("PseudoVMERGE_VXM_"#vti.LMul.MX) (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), vti.AVL, vti.Log2SEW)>; + vti.RegClass:$rs2, GPR:$rs1, (vti.Mask VMV0:$vm), vti.AVL, vti.Log2SEW)>; - def : Pat<(vti.Vector (vselect (vti.Mask V0), (SplatPat_simm5 simm5:$rs1), + def : Pat<(vti.Vector (vselect (vti.Mask VMV0:$vm), (SplatPat_simm5 simm5:$rs1), vti.RegClass:$rs2)), (!cast("PseudoVMERGE_VIM_"#vti.LMul.MX) (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), vti.AVL, vti.Log2SEW)>; + vti.RegClass:$rs2, simm5:$rs1, (vti.Mask VMV0:$vm), vti.AVL, vti.Log2SEW)>; } } @@ -1348,39 +1348,39 @@ defm : VPatFPSetCCSDNode_VV_VF_FV; foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (vselect (fvti.Mask V0), fvti.RegClass:$rs1, + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, fvti.RegClass:$rs2)), (!cast("PseudoVMERGE_VVM_"#fvti.LMul.MX) (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), + fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (vselect (fvti.Mask V0), + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), fvti.RegClass:$rs2)), (!cast("PseudoVMERGE_VXM_"#fvti.LMul.MX) (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; + fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (vselect (fvti.Mask V0), + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), (SplatFPOp (fvti.Scalar fpimm0)), fvti.RegClass:$rs2)), (!cast("PseudoVMERGE_VIM_"#fvti.LMul.MX) (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, 0, (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; + fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; } } foreach fvti = AllFloatVectors in { let Predicates = GetVTypePredicates.Predicates in - def : Pat<(fvti.Vector (vselect (fvti.Mask V0), + def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2)), (!cast("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs2, (fvti.Scalar fvti.ScalarRegClass:$rs1), - (fvti.Mask V0), fvti.AVL, fvti.Log2SEW)>; + (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; } // 13.17. 
Vector Single-Width Floating-Point/Integer Type-Convert Instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index f63c1560f6253..c914dc2ffbcd3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -633,7 +633,7 @@ class VPatBinaryVL_V( !if(isSEWAware, @@ -642,7 +642,7 @@ class VPatBinaryVL_V; + (mask_type VMV0:$vm), GPR:$vl, log2sew, TAIL_AGNOSTIC)>; class VPatBinaryVL_V_RM( !if(isSEWAware, @@ -670,7 +670,7 @@ class VPatBinaryVL_V_RM(instruction_name#"_"#suffix#"_"# vlmul.MX#"_MASK_TIED") result_reg_class:$rs1, op2_reg_class:$rs2, - (mask_type V0), GPR:$vl, sew, TU_MU)>; + (mask_type VMV0:$vm), GPR:$vl, sew, TU_MU)>; multiclass VPatTiedBinaryNoMaskVL_V_RM( !if(isSEWAware, @@ -802,7 +802,7 @@ class VPatBinaryVL_XI; + (mask_type VMV0:$vm), GPR:$vl, log2sew, TAIL_AGNOSTIC)>; multiclass VPatBinaryVL_VV_VX vtilist = AllIntegerVectors, @@ -894,7 +894,7 @@ class VPatBinaryVL_VF( !if(isSEWAware, @@ -903,7 +903,7 @@ class VPatBinaryVL_VF; + (mask_type VMV0:$vm), GPR:$vl, log2sew, TAIL_AGNOSTIC)>; class VPatBinaryVL_VF_RM( !if(isSEWAware, @@ -929,7 +929,7 @@ class VPatBinaryVL_VF_RM( !if(isSEWAware, @@ -982,7 +982,7 @@ multiclass VPatBinaryFPVL_R_VF; + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -993,7 +993,7 @@ multiclass VPatBinaryFPVL_R_VF_RM( !if(isSEWAware, @@ -1001,7 +1001,7 @@ multiclass VPatBinaryFPVL_R_VF_RM(instruction_name#"_VV_"#vti.LMul.MX#"_MASK") VR:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; } // Inherits from VPatIntegerSetCCVL_VV and adds a pattern with operands swapped. @@ -1030,11 +1030,11 @@ multiclass VPatIntegerSetCCVL_VV_Swappable(instruction_name#"_VV_"#vti.LMul.MX#"_MASK") VR:$passthru, vti.RegClass:$rs1, - vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; } multiclass VPatIntegerSetCCVL_VX_Swappable; + GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)), (vti.Vector vti.RegClass:$rs1), invcc, VR:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (instruction_masked VR:$passthru, vti.RegClass:$rs1, - GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; } multiclass VPatIntegerSetCCVL_VI_Swappable; // FIXME: Can do some canonicalization to remove these patterns. 
def : Pat<(vti.Mask (riscv_setcc_vl (splatpat_kind simm5:$rs2), (vti.Vector vti.RegClass:$rs1), invcc, VR:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (instruction_masked VR:$passthru, vti.RegClass:$rs1, - simm5:$rs2, (vti.Mask V0), GPR:$vl, + simm5:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; } @@ -1089,31 +1089,31 @@ multiclass VPatFPSetCCVL_VV_VF_FV(inst_name#"_VV_"#fvti.LMul.MX#"_MASK") VR:$passthru, fvti.RegClass:$rs1, - fvti.RegClass:$rs2, (fvti.Mask V0), + fvti.RegClass:$rs2, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1), (SplatFPOp fvti.ScalarRegClass:$rs2), cc, VR:$passthru, - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), VLOpFrag)), (!cast(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") VR:$passthru, fvti.RegClass:$rs1, - fvti.ScalarRegClass:$rs2, (fvti.Mask V0), + fvti.ScalarRegClass:$rs2, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Mask (vop (SplatFPOp fvti.ScalarRegClass:$rs2), (fvti.Vector fvti.RegClass:$rs1), cc, VR:$passthru, - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), VLOpFrag)), (!cast(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") VR:$passthru, fvti.RegClass:$rs1, - fvti.ScalarRegClass:$rs2, (fvti.Mask V0), + fvti.ScalarRegClass:$rs2, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; } } @@ -1127,11 +1127,11 @@ multiclass VPatExtendVL_V.Predicates, GetVTypePredicates.Predicates) in def : Pat<(vti.Vector (vop (fti.Vector fti.RegClass:$rs2), - (fti.Mask V0), VLOpFrag)), + (fti.Mask VMV0:$vm), VLOpFrag)), (!cast(inst_name#"_"#suffix#"_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), fti.RegClass:$rs2, - (fti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (fti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } } @@ -1143,11 +1143,11 @@ multiclass VPatConvertFP2IVL_V { let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_"#ivti.LMul.MX#"_MASK") (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), GPR:$vl, ivti.Log2SEW, TA_MA)>; + (fvti.Mask VMV0:$vm), GPR:$vl, ivti.Log2SEW, TA_MA)>; } } @@ -1158,11 +1158,11 @@ multiclass VPatConvertFP2I_RM_VL_V.Predicates, GetVTypePredicates.Predicates) in def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), (XLenVT timm:$frm), + (fvti.Mask VMV0:$vm), (XLenVT timm:$frm), VLOpFrag)), (!cast(instruction_name#"_"#ivti.LMul.MX#"_MASK") (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), timm:$frm, GPR:$vl, ivti.Log2SEW, + (fvti.Mask VMV0:$vm), timm:$frm, GPR:$vl, ivti.Log2SEW, TA_MA)>; } } @@ -1173,11 +1173,11 @@ multiclass VPatConvertI2FPVL_V_RM.Predicates, GetVTypePredicates.Predicates) in def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1), - (ivti.Mask V0), + (ivti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1, - (ivti.Mask V0), + (ivti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1191,11 +1191,11 @@ multiclass VPatConvertI2FP_RM_VL_V { let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1), - (ivti.Mask V0), (XLenVT timm:$frm), + (ivti.Mask VMV0:$vm), (XLenVT timm:$frm), VLOpFrag)), 
(!cast(instruction_name#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1, - (ivti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; + (ivti.Mask VMV0:$vm), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; } } @@ -1208,11 +1208,11 @@ multiclass VPatWConvertFP2IVL_V let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TA_MA)>; + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; } } @@ -1224,11 +1224,11 @@ multiclass VPatWConvertFP2I_RM_VL_V { let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), (XLenVT timm:$frm), + (fvti.Mask VMV0:$vm), (XLenVT timm:$frm), VLOpFrag)), (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; + (fvti.Mask VMV0:$vm), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; } } @@ -1240,11 +1240,11 @@ multiclass VPatWConvertI2FPVL_V.Predicates, GetVTypePredicates.Predicates) in def : Pat<(fwti.Vector (vop (ivti.Vector ivti.RegClass:$rs1), - (ivti.Mask V0), + (ivti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_"#ivti.LMul.MX#"_E"#ivti.SEW#"_MASK") (fwti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1, - (ivti.Mask V0), + (ivti.Mask VMV0:$vm), GPR:$vl, ivti.Log2SEW, TA_MA)>; } } @@ -1261,11 +1261,11 @@ multiclass VPatNConvertFP2IVL_W.Predicates, GetVTypePredicates.Predicates) in def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask V0), + (fwti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (fwti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } } @@ -1276,11 +1276,11 @@ multiclass VPatNConvertFP2I_RM_VL_W { let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask V0), (XLenVT timm:$frm), + (fwti.Mask VMV0:$vm), (XLenVT timm:$frm), VLOpFrag)), (!cast(instruction_name#"_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask V0), timm:$frm, GPR:$vl, vti.Log2SEW, TA_MA)>; + (fwti.Mask VMV0:$vm), timm:$frm, GPR:$vl, vti.Log2SEW, TA_MA)>; } } @@ -1292,11 +1292,11 @@ multiclass VPatNConvertI2FPVL_W_RM.Predicates, GetVTypePredicates.Predicates) in def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1), - (iwti.Mask V0), + (iwti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1, - (iwti.Mask V0), + (iwti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1311,11 +1311,11 @@ multiclass VPatNConvertI2FP_RM_VL_W { let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1), - (iwti.Mask V0), (XLenVT timm:$frm), + (iwti.Mask VMV0:$vm), (XLenVT timm:$frm), VLOpFrag)), (!cast(instruction_name#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1, - (iwti.Mask V0), timm:$frm, 
GPR:$vl, fvti.Log2SEW, TA_MA)>; + (iwti.Mask VMV0:$vm), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>; } } @@ -1325,13 +1325,13 @@ multiclass VPatReductionVL { let Predicates = GetVTypePredicates.Predicates in { def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), VR:$rs2, - (vti.Mask V0), VLOpFrag, + (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK") (vti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), (vti_m1.Vector VR:$rs2), - (vti.Mask V0), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; } } } @@ -1342,13 +1342,13 @@ multiclass VPatReductionVL_RM let Predicates = GetVTypePredicates.Predicates in { def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), VR:$rs2, - (vti.Mask V0), VLOpFrag, + (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK") (vti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), (vti_m1.Vector VR:$rs2), - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1407,11 +1407,11 @@ multiclass VPatWidenReductionVL.Predicates) in { def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru), (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))), - VR:$rs2, (vti.Mask V0), VLOpFrag, + VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK") (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), - (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW, + (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; } } @@ -1426,11 +1426,11 @@ multiclass VPatWidenReductionVL_Ext_VL.Predicates) in { def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru), (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue))), - VR:$rs2, (vti.Mask V0), VLOpFrag, + VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK") (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), - (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW, + (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; } } @@ -1445,11 +1445,11 @@ multiclass VPatWidenReductionVL_Ext_VL_RM.Predicates) in { def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru), (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue))), - VR:$rs2, (vti.Mask V0), VLOpFrag, + VR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK") (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1), - (wti_m1.Vector VR:$rs2), (vti.Mask V0), + (wti_m1.Vector VR:$rs2), (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1575,10 +1575,10 @@ multiclass VPatNarrowShiftExtVL_WV(instruction_name#"_WV_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } } @@ -1621,7 +1621,7 @@ multiclass VPatMultiplyAccVL_VV_VX { foreach vti = AllIntegerVectors in { defvar suffix = vti.LMul.MX; let Predicates = GetVTypePredicates.Predicates in { - 
def : Pat<(riscv_vmerge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), @@ -1629,8 +1629,8 @@ multiclass VPatMultiplyAccVL_VV_VX { vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), @@ -1638,8 +1638,8 @@ multiclass VPatMultiplyAccVL_VV_VX { vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), @@ -1647,8 +1647,8 @@ multiclass VPatMultiplyAccVL_VV_VX { vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), @@ -1656,7 +1656,7 @@ multiclass VPatMultiplyAccVL_VV_VX { vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } } @@ -1670,17 +1670,17 @@ multiclass VPatWidenMultiplyAddVL_VV_VX { def : Pat<(vwmacc_op (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), (wti.Vector wti.RegClass:$rd), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast(instr_name#"_VV_"#vti.LMul.MX#"_MASK") wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vwmacc_op (SplatPat XLenVT:$rs1), (vti.Vector vti.RegClass:$rs2), (wti.Vector wti.RegClass:$rd), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast(instr_name#"_VX_"#vti.LMul.MX#"_MASK") wti.RegClass:$rd, vti.ScalarRegClass:$rs1, - vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -1713,19 +1713,19 @@ multiclass VPatFPMulAddVL_VV_VF defvar suffix = vti.LMul.MX; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rd, - vti.RegClass:$rs2, (vti.Mask V0), + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, 
vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rd, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } } } @@ -1735,11 +1735,11 @@ multiclass VPatFPMulAddVL_VV_VF_RM.Predicates in { def : Pat<(vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rd, - vti.RegClass:$rs2, (vti.Mask V0), + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1747,11 +1747,11 @@ multiclass VPatFPMulAddVL_VV_VF_RM(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1764,34 +1764,34 @@ multiclass VPatFPMulAccVL_VV_VF { foreach vti = AllFloatVectors in { defvar suffix = vti.LMul.MX; let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } } @@ -1800,46 +1800,46 @@ multiclass VPatFPMulAccVL_VV_VF_RM { foreach vti = AllFloatVectors in { defvar suffix = vti.LMul.MX # "_E" # vti.SEW; let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + def : 
Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vmerge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask VMV0:$vm), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), vti.RegClass:$rd, undef, VLOpFrag), (!cast(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1856,18 +1856,18 @@ multiclass VPatWidenFPMulAccVL_VV_VF { GetVTypePredicates.Predicates) in { def : Pat<(vop (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), - (wti.Vector wti.RegClass:$rd), (vti.Mask V0), + (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), VLOpFrag), (!cast(instruction_name#"_VV_"#vti.LMul.MX #"_MASK") wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), (vti.Vector vti.RegClass:$rs2), - (wti.Vector wti.RegClass:$rd), (vti.Mask V0), + (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), VLOpFrag), (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX #"_MASK") wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } } } @@ -1886,22 +1886,22 @@ multiclass VPatWidenFPMulAccVL_VV_VF_RM(instruction_name#"_VV_"#suffix#"_MASK") wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TA_MA)>; def : Pat<(vop (vti.Vector (SplatFPOp vti.ScalarRegClass:$rs1)), (vti.Vector vti.RegClass:$rs2), - (wti.Vector wti.RegClass:$rd), (vti.Mask V0), + (wti.Vector wti.RegClass:$rd), (vti.Mask VMV0:$vm), VLOpFrag), 
(!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#suffix#"_MASK") wti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -1916,20 +1916,20 @@ multiclass VPatSlideVL_VX_VI { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rd), (vti.Vector vti.RegClass:$rs1), - uimm5:$rs2, (vti.Mask V0), + uimm5:$rs2, (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VI_"#vti.LMul.MX#"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, uimm5:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rd), (vti.Vector vti.RegClass:$rs1), - GPR:$rs2, (vti.Mask V0), + GPR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag, (XLenVT timm:$policy))), (!cast(instruction_name#"_VX_"#vti.LMul.MX#"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, GPR:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>; } } @@ -1940,10 +1940,10 @@ multiclass VPatSlide1VL_VX { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rs3), (vti.Vector vti.RegClass:$rs1), - GPR:$rs2, (vti.Mask V0), VLOpFrag)), + GPR:$rs2, (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_VX_"#vti.LMul.MX#"_MASK") vti.RegClass:$rs3, vti.RegClass:$rs1, GPR:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; } } } @@ -1953,10 +1953,10 @@ multiclass VPatSlide1VL_VF { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector (vop (vti.Vector vti.RegClass:$rs3), (vti.Vector vti.RegClass:$rs1), - vti.Scalar:$rs2, (vti.Mask V0), VLOpFrag)), + vti.Scalar:$rs2, (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_MASK") vti.RegClass:$rs3, vti.RegClass:$rs1, vti.Scalar:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TU_MU)>; } } } @@ -1966,16 +1966,16 @@ multiclass VPatAVGADDVL_VV_VX_RM { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vop (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), - vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag), + vti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVAADD"#suffix#"_VV_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs2, - (vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vop (vti.Vector vti.RegClass:$rs1), (vti.Vector (SplatPat (XLenVT GPR:$rs2))), - vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag), + vti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVAADD"#suffix#"_VX_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, GPR:$rs2, - (vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } } @@ -1995,16 +1995,16 @@ foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))), (vti.Vector vti.RegClass:$rs1), - vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag), + vti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK") 
vti.RegClass:$passthru, vti.RegClass:$rs1, GPR:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)), (vti.Vector vti.RegClass:$rs1), - vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag), + vti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVRSUB_VI_"# vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, simm5:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -2022,22 +2022,22 @@ foreach vtiToWti = AllWidenableIntVectors in { GetVTypePredicates.Predicates) in { def : Pat<(riscv_shl_vl (wti.Vector (riscv_sext_vl_oneuse (vti.Vector vti.RegClass:$rs1), - (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), 1, VLOpFrag)), - wti.RegClass:$passthru, (vti.Mask V0), VLOpFrag), + wti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWADD_VV_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs1), - (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), 1, VLOpFrag)), - wti.RegClass:$passthru, (vti.Mask V0), VLOpFrag), + wti.RegClass:$passthru, (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWADDU_VV_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -2096,11 +2096,11 @@ foreach vtiTowti = AllWidenableIntVectors in { let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVNSRL_WI_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } // 11.8. Vector Integer Comparison Instructions @@ -2187,41 +2187,41 @@ foreach vtiTowti = AllWidenableIntVectors in { def : Pat<(riscv_vwmaccsu_vl (vti.Vector vti.RegClass:$rs1), (SplatPat XLenVT:$rs2), (wti.Vector wti.RegClass:$rd), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWMACCUS_VX_"#vti.LMul.MX#"_MASK") wti.RegClass:$rd, vti.ScalarRegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } // 11.15. 
Vector Integer Merge Instructions foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), + def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask VMV0:$vm), vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$passthru, VLOpFrag)), (!cast("PseudoVMERGE_VVM_"#vti.LMul.MX) vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), + def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask VMV0:$vm), (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, vti.RegClass:$passthru, VLOpFrag)), (!cast("PseudoVMERGE_VXM_"#vti.LMul.MX) vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), + def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask VMV0:$vm), (SplatPat_simm5 simm5:$rs1), vti.RegClass:$rs2, vti.RegClass:$passthru, VLOpFrag)), (!cast("PseudoVMERGE_VIM_"#vti.LMul.MX) vti.RegClass:$passthru, vti.RegClass:$rs2, simm5:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; } } @@ -2271,17 +2271,17 @@ foreach vtiTowti = AllWidenableIntVectors in { GetVTypePredicates.Predicates) in { // Rounding mode here is arbitrary since we aren't shifting out any bits. def : Pat<(vti.Vector (riscv_trunc_vector_vl_ssat (wti.Vector wti.RegClass:$rs1), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVNCLIP_WI_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0, - (vti.Mask V0), /*RNU*/0, GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), /*RNU*/0, GPR:$vl, vti.Log2SEW, TA_MA)>; def : Pat<(vti.Vector (riscv_trunc_vector_vl_usat (wti.Vector wti.RegClass:$rs1), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK") (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0, - (vti.Mask V0), /*RNU*/0, GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), /*RNU*/0, GPR:$vl, vti.Log2SEW, TA_MA)>; } } @@ -2349,39 +2349,39 @@ defm : VPatFPSetCCVL_VV_VF_FV.Predicates in { // 13.8. Vector Floating-Point Square-Root Instruction - def : Pat<(any_riscv_fsqrt_vl (vti.Vector vti.RegClass:$rs2), (vti.Mask V0), + def : Pat<(any_riscv_fsqrt_vl (vti.Vector vti.RegClass:$rs2), (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFSQRT_V_"# vti.LMul.MX # "_E" # vti.SEW # "_MASK") (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, - (vti.Mask V0), + (vti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TA_MA)>; // 13.12. Vector Floating-Point Sign-Injection Instructions - def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask V0), + def : Pat<(riscv_fabs_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFSGNJX_VV_"# vti.LMul.MX #"_E"#vti.SEW#"_MASK") (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, - vti.RegClass:$rs, (vti.Mask V0), GPR:$vl, vti.Log2SEW, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; // Handle fneg with VFSGNJN using the same input for both operands. 
- def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask V0), + def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFSGNJN_VV_"# vti.LMul.MX#"_E"#vti.SEW #"_MASK") (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs, - vti.RegClass:$rs, (vti.Mask V0), GPR:$vl, vti.Log2SEW, + vti.RegClass:$rs, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFSGNJ_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, - vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), @@ -2398,26 +2398,26 @@ foreach vti = AllFloatVectors in { def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1), (SplatFPOp vti.ScalarRegClass:$rs2), vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, - vti.ScalarRegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, + vti.ScalarRegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; // Rounding without exception to implement nearbyint. def : Pat<(any_riscv_vfround_noexcept_vl (vti.Vector vti.RegClass:$rs1), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFROUND_NOEXCEPT_V_" # vti.LMul.MX #"_MASK") (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; // 14.14. Vector Floating-Point Classify Instruction def : Pat<(riscv_fclass_vl (vti.Vector vti.RegClass:$rs2), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVFCLASS_V_"# vti.LMul.MX #"_MASK") (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TA_MA)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MA)>; } } @@ -2427,39 +2427,39 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { // 13.15. 
Vector Floating-Point Merge Instruction defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, fvti.RegClass:$rs2, fvti.RegClass:$passthru, VLOpFrag)), (!cast("PseudoVMERGE_VVM_"#fvti.LMul.MX) - fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), + fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), fvti.RegClass:$rs2, fvti.RegClass:$passthru, VLOpFrag)), (!cast("PseudoVMERGE_VXM_"#fvti.LMul.MX) - fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), + fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), (SplatFPOp (fvti.Scalar fpimm0)), fvti.RegClass:$rs2, fvti.RegClass:$passthru, VLOpFrag)), (!cast("PseudoVMERGE_VIM_"#fvti.LMul.MX) - fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask V0), + fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; } } foreach fvti = AllFloatVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2, fvti.RegClass:$passthru, @@ -2467,7 +2467,7 @@ foreach fvti = AllFloatVectors in { (!cast("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) fvti.RegClass:$passthru, fvti.RegClass:$rs2, (fvti.Scalar fvti.ScalarRegClass:$rs1), - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; } } @@ -2529,11 +2529,11 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { GetVTypePredicates.Predicates)) in def : Pat<(fwti.Vector (any_riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFWCVT_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; } @@ -2543,11 +2543,11 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { let Predicates = [HasVInstructionsBF16Minimal] in def : Pat<(fwti.Vector (any_riscv_fpextend_vl (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFWCVTBF16_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; } @@ -2573,10 +2573,10 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { GetVTypePredicates.Predicates)) in { def : Pat<(fvti.Vector (any_riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask V0), VLOpFrag)), + (fwti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask V0), + (fwti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -2586,10 +2586,10 @@ foreach fvtiToFWti = AllWidenableFloatVectors in { GetVTypePredicates.Predicates) in def : 
Pat<(fvti.Vector (any_riscv_fncvt_rod_vl (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask V0), VLOpFrag)), + (fwti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFNCVT_ROD_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask V0), GPR:$vl, fvti.Log2SEW, TA_MA)>; + (fwti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW, TA_MA)>; } } @@ -2599,10 +2599,10 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { let Predicates = [HasVInstructionsBF16Minimal] in def : Pat<(fvti.Vector (any_riscv_fpround_vl (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask V0), VLOpFrag)), + (fwti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK") (fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask V0), + (fwti.Mask VMV0:$vm), // Value to indicate no rounding mode change in // RISCVInsertReadWriteCSR FRM_DYN, @@ -2705,20 +2705,20 @@ foreach mti = AllMasks in { VLOpFrag)), (!cast("PseudoVCPOP_M_" # mti.BX) VR:$rs2, GPR:$vl, mti.Log2SEW)>; - def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask V0), + def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVCPOP_M_" # mti.BX # "_MASK") - VR:$rs2, (mti.Mask V0), GPR:$vl, mti.Log2SEW)>; + VR:$rs2, (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>; // 15.3 vfirst find-first-set mask bit def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask true_mask), VLOpFrag)), (!cast("PseudoVFIRST_M_" # mti.BX) VR:$rs2, GPR:$vl, mti.Log2SEW)>; - def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask V0), + def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFIRST_M_" # mti.BX # "_MASK") - VR:$rs2, (mti.Mask V0), GPR:$vl, mti.Log2SEW)>; + VR:$rs2, (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>; } } @@ -2741,26 +2741,26 @@ foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2, vti.RegClass:$rs1, vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1, vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, uimm5:$imm, vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$imm, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } // emul = lmul * 16 / sew @@ -2776,11 +2776,11 @@ foreach vti = AllIntegerVectors in { (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2, (ivti.Vector ivti.RegClass:$rs1), vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(inst#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, ivti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -2811,27 +2811,27 @@ foreach vti = 
!listconcat(AllFloatVectors, AllBFloatVectors) in { (riscv_vrgather_vv_vl vti.RegClass:$rs2, (ivti.Vector vti.RegClass:$rs1), vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1, vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, uimm5:$imm, vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$imm, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } defvar vlmul = vti.LMul; @@ -2847,11 +2847,11 @@ foreach vti = !listconcat(AllFloatVectors, AllBFloatVectors) in { (riscv_vrgatherei16_vv_vl vti.RegClass:$rs2, (ivti.Vector ivti.RegClass:$rs1), vti.RegClass:$passthru, - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(inst#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, ivti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -2888,10 +2888,10 @@ def riscv_fslide1down_vl : SDNode<"RISCVISD::VFSLIDE1DOWN_VL", SDTRVVFSlide1, [ foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates.Predicates in { - def : Pat<(vti.Vector (riscv_vid_vl (vti.Mask V0), + def : Pat<(vti.Vector (riscv_vid_vl (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVID_V_"#vti.LMul.MX#"_MASK") - (vti.Vector (IMPLICIT_DEF)), (vti.Mask V0), GPR:$vl, vti.Log2SEW, + (vti.Vector (IMPLICIT_DEF)), (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index 430d75e5cec5b..470555769d493 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -716,12 +716,12 @@ multiclass VPatUnaryVL_V.Predicates) in { def : Pat<(vti.Vector (op (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast(instruction_name#"_V_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs1, - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -736,17 +736,17 @@ foreach vti = AllIntegerVectors in { (vti.Vector vti.RegClass:$rs1), (riscv_splat_vector -1), (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag), (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVANDN_VV_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -755,13 +755,13 @@ foreach vti = AllIntegerVectors in { (not vti.ScalarRegClass:$rs1)), (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), 
(!cast("PseudoVANDN_VX_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, vti.ScalarRegClass:$rs1, - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -769,13 +769,13 @@ foreach vti = AllIntegerVectors in { def : Pat<(vti.Vector (riscv_and_vl (riscv_splat_vector invLogicImm:$rs1), (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), + (vti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVANDN_VX_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, invLogicImm:$rs1, - (vti.Mask V0), + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -797,12 +797,12 @@ foreach vti = AllIntegerVectors in { def : Pat<(riscv_rotl_vl vti.RegClass:$rs2, (vti.Vector (SplatPat_uimm6 uimm6:$rs1)), (vti.Vector vti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVROR_VI_"#vti.LMul.MX#"_MASK") vti.RegClass:$passthru, vti.RegClass:$rs2, (!cast("InvRot" # vti.SEW # "Imm") uimm6:$rs1), - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } defm : VPatBinaryVL_VV_VX_VI; @@ -817,90 +817,90 @@ foreach vtiToWti = AllWidenableIntVectors in { (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (ext_oneuse (vti.Vector vti.RegClass:$rs1))), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs2), - (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (riscv_ext_vl_oneuse (vti.Vector vti.RegClass:$rs1), - (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs2), - (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl (wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))), (wti.Vector (SplatPat_uimm5 uimm5:$rs1)), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, 
vti.RegClass:$rs2, uimm5:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse (vti.Vector vti.RegClass:$rs2), - (vti.Mask V0), VLOpFrag)), + (vti.Mask VMV0:$vm), VLOpFrag)), (wti.Vector (SplatPat_uimm5 uimm5:$rs1)), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_vwsll_vl (vti.Vector vti.RegClass:$rs2), (vti.Vector vti.RegClass:$rs1), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_vwsll_vl (vti.Vector vti.RegClass:$rs2), (vti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; def : Pat<(riscv_vwsll_vl (vti.Vector vti.RegClass:$rs2), (vti.Vector (SplatPat_uimm5 uimm5:$rs1)), (wti.Vector wti.RegClass:$passthru), - (vti.Mask V0), VLOpFrag), + (vti.Mask VMV0:$vm), VLOpFrag), (!cast("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK") wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; } } @@ -1046,12 +1046,12 @@ multiclass VPatBinaryV_VI_VROL; } } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index b5eea138732a5..05fcbfd42b092 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -89,20 +89,20 @@ class RISCVTuneProcessorModel, GenericTuneInfo; def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", - NoSchedModel, + GenericModel, [Feature64Bit, FeatureStdExtI], GenericTuneFeatures>, GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. 
-def GENERIC : RISCVTuneProcessorModel<"generic", NoSchedModel>, GenericTuneInfo; +def GENERIC : RISCVTuneProcessorModel<"generic", GenericModel>, GenericTuneInfo; def MIPS_P8700 : RISCVProcessorModel<"mips-p8700", MIPSP8700Model, @@ -496,7 +496,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8", TunePostRAScheduler]>; def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", - NoSchedModel, + GenericModel, [Feature64Bit, FeatureStdExtI, FeatureStdExtZifencei, @@ -556,7 +556,7 @@ def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", TuneShiftedZExtWFusion]>; def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", - NoSchedModel, + GenericModel, !listconcat(RVA22S64Features, [FeatureStdExtV, FeatureStdExtSscofpmf, @@ -581,7 +581,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", } def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3", - NoSchedModel, + GenericModel, [Feature32Bit, FeatureStdExtI, FeatureStdExtM, diff --git a/llvm/lib/Target/RISCV/RISCVSchedGeneric.td b/llvm/lib/Target/RISCV/RISCVSchedGeneric.td new file mode 100644 index 0000000000000..a10cf624aa3a4 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVSchedGeneric.td @@ -0,0 +1,18 @@ +//===-- RISCVSchedGeneric.td - Generic In-order Processor --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Adjust the default cost model to enable all scheduling heuristics, not just +// latency. +// +// In particular, this enables register pressure heuristics which are very +// important for vector code with high LMULs, and have little negative impact +// on other configurations. 
+def GenericModel : SchedMachineModel { + let MicroOpBufferSize = 1; + let CompleteModel = 0; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 4a69bdeb76161..167dbb53c5950 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -137,6 +137,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVExpandPseudoPass(*PR); initializeRISCVVectorPeepholePass(*PR); initializeRISCVVLOptimizerPass(*PR); + initializeRISCVVMV0EliminationPass(*PR); initializeRISCVInsertVSETVLIPass(*PR); initializeRISCVInsertReadWriteCSRPass(*PR); initializeRISCVInsertWriteVXRMPass(*PR); @@ -587,6 +588,8 @@ void RISCVPassConfig::addPreEmitPass2() { void RISCVPassConfig::addMachineSSAOptimization() { addPass(createRISCVVectorPeepholePass()); + // TODO: Move this to pre regalloc + addPass(createRISCVVMV0EliminationPass()); TargetPassConfig::addMachineSSAOptimization(); @@ -599,6 +602,10 @@ void RISCVPassConfig::addMachineSSAOptimization() { } void RISCVPassConfig::addPreRegAlloc() { + // TODO: Move this as late as possible before regalloc + if (TM->getOptLevel() == CodeGenOptLevel::None) + addPass(createRISCVVMV0EliminationPass()); + addPass(createRISCVPreRAExpandPseudoPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createRISCVMergeBaseOffsetOptPass()); diff --git a/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp b/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp new file mode 100644 index 0000000000000..efd92c55e3adf --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVVMV0Elimination.cpp @@ -0,0 +1,172 @@ +//===- RISCVVMV0Elimination.cpp - VMV0 Elimination -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// Mask operands in vector pseudos have to be in v0. We select them as a virtual +// register in the singleton vmv0 register class instead of copying them to $v0 +// straight away, to make optimizing masks easier. +// +// However, register coalescing may end up coalescing copies into vmv0, resulting +// in instructions with multiple uses of vmv0 that the register allocator can't +// allocate: +// +// %x:vrnov0 = PseudoVADD_VV_M1_MASK %0:vrnov0, %1:vr, %2:vmv0, %3:vmv0, ... +// +// To avoid this, this pass replaces any uses* of vmv0 with copies to $v0 before +// register coalescing and allocation: +// +// %x:vrnov0 = PseudoVADD_VV_M1_MASK %0:vrnov0, %1:vr, %2:vr, %3:vmv0, ... +// -> +// $v0 = COPY %3:vr +// %x:vrnov0 = PseudoVADD_VV_M1_MASK %0:vrnov0, %1:vr, %2:vr, $v0, ... +// +// * The only uses of vmv0 left behind are when used for inline asm with the vm +// constraint. 
+// +//===---------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#ifndef NDEBUG +#include "llvm/ADT/PostOrderIterator.h" +#endif +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-vmv0-elimination" + +namespace { + +class RISCVVMV0Elimination : public MachineFunctionPass { +public: + static char ID; + RISCVVMV0Elimination() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + // TODO: We could move this closer to regalloc, out of SSA, which would + // allow scheduling past mask operands. We would need to preserve live + // intervals. + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } +}; + +} // namespace + +char RISCVVMV0Elimination::ID = 0; + +INITIALIZE_PASS(RISCVVMV0Elimination, DEBUG_TYPE, "RISC-V VMV0 Elimination", + false, false) + +FunctionPass *llvm::createRISCVVMV0EliminationPass() { + return new RISCVVMV0Elimination(); +} + +static bool isVMV0(const MCOperandInfo &MCOI) { + return MCOI.RegClass == RISCV::VMV0RegClassID; +} + +bool RISCVVMV0Elimination::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + // Skip if the vector extension is not enabled. + const RISCVSubtarget *ST = &MF.getSubtarget(); + if (!ST->hasVInstructions()) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = ST->getInstrInfo(); + +#ifndef NDEBUG + // Assert that we won't clobber any existing reads of v0 where we need to + // insert copies. + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + ReversePostOrderTraversal RPOT(&*MF.begin()); + for (MachineBasicBlock *MBB : RPOT) { + bool V0Clobbered = false; + for (MachineInstr &MI : *MBB) { + assert(!(MI.readsRegister(RISCV::V0, TRI) && V0Clobbered) && + "Inserting a copy to v0 would clobber a read"); + if (MI.modifiesRegister(RISCV::V0, TRI)) + V0Clobbered = false; + + if (any_of(MI.getDesc().operands(), isVMV0)) + V0Clobbered = true; + } + + assert(!(V0Clobbered && + any_of(MBB->successors(), + [](auto *Succ) { return Succ->isLiveIn(RISCV::V0); })) && + "Clobbered a v0 used in a successor"); + } +#endif + + bool MadeChange = false; + + // For any instruction with a vmv0 operand, replace it with a copy to v0. + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + assert(count_if(MI.getDesc().operands(), isVMV0) < 2 && + "Expected only one or zero vmv0 operands"); + + for (auto [OpNo, MCOI] : enumerate(MI.getDesc().operands())) { + if (isVMV0(MCOI)) { + MachineOperand &MO = MI.getOperand(OpNo); + Register Src = MO.getReg(); + assert(MO.isUse() && MO.getSubReg() == RISCV::NoSubRegister && + Src.isVirtual() && "vmv0 use in unexpected form"); + + // Peek through a single copy to match what isel does. 
+ if (MachineInstr *SrcMI = MRI.getVRegDef(Src); + SrcMI->isCopy() && SrcMI->getOperand(1).getReg().isVirtual()) { + assert(SrcMI->getOperand(1).getSubReg() == RISCV::NoSubRegister); + Src = SrcMI->getOperand(1).getReg(); + } + + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(RISCV::COPY), RISCV::V0) + .addReg(Src); + + MO.setReg(RISCV::V0); + MadeChange = true; + break; + } + } + } + } + + if (!MadeChange) + return false; + + // Now that any constraints requiring vmv0 are gone, eliminate any uses of + // vmv0 by recomputing the reg class. + // The only remaining uses should be around inline asm. + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (MachineOperand &MO : MI.uses()) { + if (MO.isReg() && MO.getReg().isVirtual() && + MRI.getRegClass(MO.getReg()) == &RISCV::VMV0RegClass) { + MRI.recomputeRegClass(MO.getReg()); + assert(MRI.getRegClass(MO.getReg()) != &RISCV::VMV0RegClass || + MI.isInlineAsm() || + MRI.getVRegDef(MO.getReg())->isInlineAsm() && + "Non-inline-asm use of vmv0 left behind"); + } + } + } + } + + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index bb2d1717c3b1e..a4e7219c39f37 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -73,9 +73,7 @@ class RISCVVectorPeephole : public MachineFunctionPass { bool isAllOnesMask(const MachineInstr *MaskDef) const; std::optional getConstant(const MachineOperand &VL) const; bool ensureDominates(const MachineOperand &Use, MachineInstr &Src) const; - - /// Maps uses of V0 to the corresponding def of V0. - DenseMap V0Defs; + bool isKnownSameDefs(const MachineOperand &A, const MachineOperand &B) const; }; } // namespace @@ -268,14 +266,8 @@ bool RISCVVectorPeephole::convertToVLMAX(MachineInstr &MI) const { } bool RISCVVectorPeephole::isAllOnesMask(const MachineInstr *MaskDef) const { - assert(MaskDef && MaskDef->isCopy() && - MaskDef->getOperand(0).getReg() == RISCV::V0); - Register SrcReg = TRI->lookThruCopyLike(MaskDef->getOperand(1).getReg(), MRI); - if (!SrcReg.isVirtual()) - return false; - MaskDef = MRI->getVRegDef(SrcReg); - if (!MaskDef) - return false; + while (MaskDef->isCopy() && MaskDef->getOperand(1).getReg().isVirtual()) + MaskDef = MRI->getVRegDef(MaskDef->getOperand(1).getReg()); // TODO: Check that the VMSET is the expected bitwidth? The pseudo has // undefined behaviour if it's the wrong bitwidth, so we could choose to @@ -372,8 +364,7 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { unsigned NewOpc = getVMV_V_VOpcodeForVMERGE_VVM(MI); if (!NewOpc) return false; - assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0); - if (!isAllOnesMask(V0Defs.lookup(&MI))) + if (!isAllOnesMask(MRI->getVRegDef(MI.getOperand(4).getReg()))) return false; MI.setDesc(TII->get(NewOpc)); @@ -390,6 +381,15 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { return true; } +bool RISCVVectorPeephole::isKnownSameDefs(const MachineOperand &A, + const MachineOperand &B) const { + if (A.getReg().isPhysical() || B.getReg().isPhysical()) + return false; + + return TRI->lookThruCopyLike(A.getReg(), MRI) == + TRI->lookThruCopyLike(B.getReg(), MRI); +} + /// If a PseudoVMERGE_VVM's true operand is a masked pseudo and both have the /// same mask, and the masked pseudo's passthru is the same as the false /// operand, we can convert the PseudoVMERGE_VVM to a PseudoVMV_V_V. 
@@ -404,14 +404,18 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { if (!NewOpc) return false; MachineInstr *True = MRI->getVRegDef(MI.getOperand(3).getReg()); - if (!True || True->getParent() != MI.getParent() || - !RISCV::getMaskedPseudoInfo(True->getOpcode()) || !hasSameEEW(MI, *True)) + + if (!True || True->getParent() != MI.getParent()) return false; - const MachineInstr *TrueV0Def = V0Defs.lookup(True); - const MachineInstr *MIV0Def = V0Defs.lookup(&MI); - assert(TrueV0Def && TrueV0Def->isCopy() && MIV0Def && MIV0Def->isCopy()); - if (TrueV0Def->getOperand(1).getReg() != MIV0Def->getOperand(1).getReg()) + auto *TrueMaskedInfo = RISCV::getMaskedPseudoInfo(True->getOpcode()); + if (!TrueMaskedInfo || !hasSameEEW(MI, *True)) + return false; + + const MachineOperand &TrueMask = + True->getOperand(TrueMaskedInfo->MaskOpIdx + True->getNumExplicitDefs()); + const MachineOperand &MIMask = MI.getOperand(4); + if (!isKnownSameDefs(TrueMask, MIMask)) return false; // True's passthru needs to be equivalent to False @@ -450,7 +454,8 @@ bool RISCVVectorPeephole::convertToUnmasked(MachineInstr &MI) const { if (!I) return false; - if (!isAllOnesMask(V0Defs.lookup(&MI))) + if (!isAllOnesMask(MRI->getVRegDef( + MI.getOperand(I->MaskOpIdx + MI.getNumExplicitDefs()).getReg()))) return false; // There are two classes of pseudos in the table - compares and @@ -575,7 +580,6 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { MRI->replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); MI.eraseFromParent(); - V0Defs.erase(&MI); return true; } @@ -645,7 +649,6 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { MRI->replaceRegWith(MI.getOperand(0).getReg(), Src->getOperand(0).getReg()); MI.eraseFromParent(); - V0Defs.erase(&MI); return true; } @@ -665,24 +668,6 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - // Masked pseudos coming out of isel will have their mask operand in the form: - // - // $v0:vr = COPY %mask:vr - // %x:vr = Pseudo_MASK %a:vr, %b:br, $v0:vr - // - // Because $v0 isn't in SSA, keep track of its definition at each use so we - // can check mask operands. 
- for (const MachineBasicBlock &MBB : MF) { - const MachineInstr *CurrentV0Def = nullptr; - for (const MachineInstr &MI : MBB) { - if (MI.readsRegister(RISCV::V0, TRI)) - V0Defs[&MI] = CurrentV0Def; - - if (MI.definesRegister(RISCV::V0, TRI)) - CurrentV0Def = &MI; - } - } - for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : make_early_inc_range(MBB)) { Changed |= convertToVLMAX(MI); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 719abde3d7fa6..4f3cc9ada04cf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -93,7 +93,7 @@ class SPIRVEmitIntrinsics void eraseTodoType(Value *Op) { auto It = TodoType.find(Op); if (It != TodoType.end() && It->second) { - TodoType[Op] = false; + It->second = false; --TodoTypeSz; } } diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index d307c73a87fc9..589dd8b634125 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -688,16 +688,17 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { int64_t SrcDisp = MI->getOperand(5).getImm(); SystemZTargetStreamer *TS = getTargetStreamer(); - MCSymbol *DotSym = nullptr; - MCInst ET = MCInstBuilder(TargetInsOpc).addReg(DestReg) - .addImm(DestDisp).addImm(1).addReg(SrcReg).addImm(SrcDisp); + MCInst ET = MCInstBuilder(TargetInsOpc) + .addReg(DestReg) + .addImm(DestDisp) + .addImm(1) + .addReg(SrcReg) + .addImm(SrcDisp); SystemZTargetStreamer::MCInstSTIPair ET_STI(ET, &MF->getSubtarget()); - SystemZTargetStreamer::EXRLT2SymMap::iterator I = - TS->EXRLTargets2Sym.find(ET_STI); - if (I != TS->EXRLTargets2Sym.end()) - DotSym = I->second; - else - TS->EXRLTargets2Sym[ET_STI] = DotSym = OutContext.createTempSymbol(); + auto [It, Inserted] = TS->EXRLTargets2Sym.try_emplace(ET_STI); + if (Inserted) + It->second = OutContext.createTempSymbol(); + MCSymbol *DotSym = It->second; const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext); EmitToStreamer( *OutStreamer, diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index f693ef3dbf962..2a4e2c897b18d 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -1246,7 +1246,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { if (Group) WasmSym->setComdat(true); auto *WS = getContext().getWasmSection(SecName, SectionKind::getText(), 0, - Group, MCContext::GenericSectionID); + Group, MCSection::NonUniqueID); getStreamer().switchSection(WS); // Also generate DWARF for this section if requested. 
if (getContext().getGenDwarfForAssembly()) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index 8cb692f9bc0c4..474f04628066b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -27,12 +27,6 @@ namespace llvm { -namespace WebAssembly { - -int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); - -} - class WebAssemblySubtarget; class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index a3547f802976b..25f81834bfa5e 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -451,8 +451,8 @@ bool X86DomainReassignment::visitRegister(Closure &C, Register Reg, } bool X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { - auto I = EnclosedInstrs.find(MI); - if (I != EnclosedInstrs.end()) { + auto [I, Inserted] = EnclosedInstrs.try_emplace(MI, C.getID()); + if (!Inserted) { if (I->second != C.getID()) { // Instruction already belongs to another closure, avoid conflicts between // closure and mark this closure as illegal. @@ -462,7 +462,6 @@ bool X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { return true; } - EnclosedInstrs[MI] = C.getID(); C.addInstruction(MI); // Mark closure as illegal for reassignment to domains, if there is no diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8c28985c8e8e7..4fd07bf11bd83 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9800,6 +9800,24 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, MaskSize == (int)ExpectedOp.getNumOperands()) return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx); break; + case ISD::BITCAST: + if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) { + SDValue Src = peekThroughBitcasts(Op); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isVector() && + (SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) { + unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits(); + return (Idx % Scale) == (ExpectedIdx % Scale) && + IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src, + Idx / Scale, ExpectedIdx / Scale); + } + } + break; + case ISD::VECTOR_SHUFFLE: { + auto *SVN = cast(Op); + return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize && + SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx); + } case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: // TODO: Handle MaskSize != VT.getVectorNumElements()? @@ -12462,7 +12480,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { - return V->hasOneUse() && + return V.hasOneUse() && ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } @@ -12780,8 +12798,13 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // Check that the mask is a broadcast. int BroadcastIdx = getSplatIndex(Mask); - if (BroadcastIdx < 0) - return SDValue(); + if (BroadcastIdx < 0) { + // Check for hidden broadcast. 
+ SmallVector BroadcastMask(VT.getVectorNumElements(), 0); + if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2)) + return SDValue(); + BroadcastIdx = 0; + } assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); @@ -41556,8 +41579,7 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, EVT ShuffleVT = N.getValueType(); unsigned Opc = N.getOpcode(); - auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true, - bool FoldLoad = false) { + auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) { // AllZeros/AllOnes constants are freely shuffled and will peek through // bitcasts. Other constant build vectors do not peek through bitcasts. Only // merge with target shuffles if it has one use so shuffle combining is @@ -41570,7 +41592,6 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, (Op.getOpcode() == Opc && Op->hasOneUse()) || (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) || (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) || - (FoldLoad && isShuffleFoldableLoad(Op)) || DAG.isSplatValue(Op, /*AllowUndefs*/ false); }; auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) { @@ -41606,9 +41627,8 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); bool FoldShuf = Opc != X86ISD::VPERMI; - bool FoldLoad = Opc != X86ISD::PSHUFB; - if (IsMergeableWithShuffle(Op00, FoldShuf, FoldLoad) || - IsMergeableWithShuffle(Op01, FoldShuf, FoldLoad)) { + if (IsMergeableWithShuffle(Op00, FoldShuf) || + IsMergeableWithShuffle(Op01, FoldShuf)) { SDValue LHS, RHS; Op00 = DAG.getBitcast(ShuffleVT, Op00); Op01 = DAG.getBitcast(ShuffleVT, Op01); @@ -42506,9 +42526,9 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, DAG.getVectorIdxConstant(0, DL)); } } - SmallVector Ops; + SmallVector SrcOps; SmallVector Mask; - if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) { + if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) { assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); SDValue V1 = peekThroughBitcasts(N.getOperand(0)); SDValue V2 = peekThroughBitcasts(N.getOperand(2)); @@ -58485,10 +58505,26 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, DAG.isSplatValue(InVec, /*AllowUndefs*/ false))) return extractSubVector(InVec, 0, DAG, DL, SizeInBits); - // If we're extracting a broadcasted subvector, just use the lowest subvector. - if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast(InVec)->getMemoryVT() == VT) - return extractSubVector(InVec, 0, DAG, DL, SizeInBits); + // Check if we're extracting a whole broadcasted subvector. + if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { + auto *MemIntr = cast(InVec); + EVT MemVT = MemIntr->getMemoryVT(); + if (MemVT == VT) { + // Just use the lowest subvector. + if (IdxVal != 0) + return extractSubVector(InVec, 0, DAG, DL, SizeInBits); + // If this is the only use, we can replace with a regular load (this may + // have been missed by SimplifyDemandedVectorElts due to extra uses of the + // memory chain). 
+ if (InVec.hasOneUse()) { + SDValue Ld = + DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(), + MemIntr->getMemOperand()); + DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1)); + return Ld; + } + } + } // Attempt to extract from the source of a shuffle vector. if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) { diff --git a/llvm/lib/Target/Xtensa/Disassembler/CMakeLists.txt b/llvm/lib/Target/Xtensa/Disassembler/CMakeLists.txt index 43f235b7cd31d..319d912e5561b 100644 --- a/llvm/lib/Target/Xtensa/Disassembler/CMakeLists.txt +++ b/llvm/lib/Target/Xtensa/Disassembler/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_component_library(LLVMXtensaDisassembler LINK_COMPONENTS MCDisassembler Support + XtensaDesc XtensaInfo ADD_TO_COMPONENT diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 4104e4e533e9d..62fa3af502e29 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -851,7 +851,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, } else { DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar, DBuilder.createExpression(), DILoc, - &*Shape.getInsertPtAfterFramePtr()); + Shape.getInsertPtAfterFramePtr()); } } @@ -1146,7 +1146,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved) .insertDeclare(CurrentReload, DDI->getVariable(), DDI->getExpression(), DDI->getDebugLoc(), - &*Builder.GetInsertPoint()); + Builder.GetInsertPoint()); } // This dbg.declare is for the main function entry point. It // will be deleted in all coro-split functions. diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 17e7fada10827..c1dd8bc393f33 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3970,18 +3970,17 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // TODO: We should track the capturing uses in AANoCapture but the problem // is CGSCC runs. For those we would need to "allow" AANoCapture for // a value in the module slice. - switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { - case UseCaptureKind::NO_CAPTURE: + // TODO(captures): Make this more precise. + UseCaptureInfo CI = + DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); + if (capturesNothing(CI)) return true; - case UseCaptureKind::MAY_CAPTURE: - LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI - << "\n"); - return false; - case UseCaptureKind::PASSTHROUGH: + if (CI.isPassthrough()) { Follow = true; return true; } - llvm_unreachable("unknown UseCaptureKind"); + LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI << "\n"); + return false; }; bool IsKnownNoCapture; @@ -6019,16 +6018,16 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { }; auto UseCheck = [&](const Use &U, bool &Follow) -> bool { - switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { - case UseCaptureKind::NO_CAPTURE: + // TODO(captures): Make this more precise. 
+ UseCaptureInfo CI = + DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); + if (capturesNothing(CI)) return true; - case UseCaptureKind::MAY_CAPTURE: - return checkUse(A, T, U, Follow); - case UseCaptureKind::PASSTHROUGH: + if (CI.isPassthrough()) { Follow = true; return true; } - llvm_unreachable("Unexpected use capture kind!"); + return checkUse(A, T, U, Follow); }; if (!A.checkForAllUses(UseCheck, *this, *V)) @@ -12151,16 +12150,13 @@ struct AAGlobalValueInfoFloating : public AAGlobalValueInfo { auto UsePred = [&](const Use &U, bool &Follow) -> bool { Uses.insert(&U); - switch (DetermineUseCaptureKind(U, nullptr)) { - case UseCaptureKind::NO_CAPTURE: - return checkUse(A, U, Follow, Worklist); - case UseCaptureKind::MAY_CAPTURE: - return checkUse(A, U, Follow, Worklist); - case UseCaptureKind::PASSTHROUGH: + // TODO(captures): Make this more precise. + UseCaptureInfo CI = DetermineUseCaptureKind(U, /*Base=*/nullptr, nullptr); + if (CI.isPassthrough()) { Follow = true; return true; } - return true; + return checkUse(A, U, Follow, Worklist); }; auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { Uses.insert(&OldU); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index cf56f67e4de3f..a66d7ce9c3f50 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -71,7 +71,9 @@ using namespace llvm; #define DEBUG_TYPE "function-attrs" STATISTIC(NumMemoryAttr, "Number of functions with improved memory attribute"); -STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); +STATISTIC(NumCapturesNone, "Number of arguments marked captures(none)"); +STATISTIC(NumCapturesPartial, "Number of arguments marked with captures " + "attribute other than captures(none)"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); @@ -108,6 +110,13 @@ static cl::opt DisableThinLTOPropagation( "disable-thinlto-funcattrs", cl::init(true), cl::Hidden, cl::desc("Don't propagate function-attrs in thinLTO")); +static void addCapturesStat(CaptureInfo CI) { + if (capturesNothing(CI)) + ++NumCapturesNone; + else + ++NumCapturesPartial; +} + namespace { using SCCNodeSet = SmallSetVector; @@ -132,6 +141,7 @@ static void addLocAccess(MemoryEffects &ME, const MemoryLocation &Loc, // If it's not an identified object, it might be an argument. if (!isIdentifiedObject(UO)) ME |= MemoryEffects::argMemOnly(MR); + ME |= MemoryEffects(IRMemLocation::ErrnoMem, MR); ME |= MemoryEffects(IRMemLocation::Other, MR); } @@ -210,6 +220,9 @@ checkFunctionMemoryAccess(Function &F, bool ThisBody, AAResults &AAR, if (isa(I)) continue; + // Merge callee's memory effects into caller's ones, including + // inaccessible and errno memory, but excluding argument memory, which is + // handled separately. ME |= CallME.getWithoutLoc(IRMemLocation::ArgMem); // If the call accesses captured memory (currently part of "other") and @@ -494,6 +507,9 @@ namespace { /// SCC of the arguments. struct ArgumentGraphNode { Argument *Definition; + /// CaptureComponents for this argument, excluding captures via Uses. + /// We don't distinguish between other/return captures here. 
+ CaptureComponents CC = CaptureComponents::None; SmallVector Uses; }; @@ -535,18 +551,36 @@ class ArgumentGraph { struct ArgumentUsesTracker : public CaptureTracker { ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {} - void tooManyUses() override { Captured = true; } + void tooManyUses() override { CI = CaptureInfo::all(); } + + Action captured(const Use *U, UseCaptureInfo UseCI) override { + if (updateCaptureInfo(U, UseCI.UseCC)) { + // Don't bother continuing if we already capture everything. + if (capturesAll(CI.getOtherComponents())) + return Stop; + return Continue; + } + + // For SCC argument tracking, we're not going to analyze other/ret + // components separately, so don't follow the return value. + return ContinueIgnoringReturn; + } - bool captured(const Use *U) override { + bool updateCaptureInfo(const Use *U, CaptureComponents CC) { CallBase *CB = dyn_cast(U->getUser()); if (!CB) { - Captured = true; + if (isa(U->getUser())) + CI |= CaptureInfo::retOnly(CC); + else + // Conservatively assume that the captured value might make its way + // into the return value as well. This could be made more precise. + CI |= CaptureInfo(CC); return true; } Function *F = CB->getCalledFunction(); if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) { - Captured = true; + CI |= CaptureInfo(CC); return true; } @@ -560,22 +594,24 @@ struct ArgumentUsesTracker : public CaptureTracker { // use. In this case it does not matter if the callee is within our SCC // or not -- we've been captured in some unknown way, and we have to be // conservative. - Captured = true; + CI |= CaptureInfo(CC); return true; } if (UseIndex >= F->arg_size()) { assert(F->isVarArg() && "More params than args in non-varargs call"); - Captured = true; + CI |= CaptureInfo(CC); return true; } + // TODO(captures): Could improve precision by remembering maximum + // capture components for the argument. Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); return false; } - // True only if certainly captured (used outside our SCC). - bool Captured = false; + // Does not include potential captures via Uses in the SCC. + CaptureInfo CI = CaptureInfo::none(); // Uses within our SCC. SmallVector Uses; @@ -1190,6 +1226,15 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, bool SkipInitializes) { ArgumentGraph AG; + auto DetermineAccessAttrsForSingleton = [](Argument *A) { + SmallPtrSet Self; + Self.insert(A); + Attribute::AttrKind R = determinePointerAccessAttrs(A, Self); + if (R != Attribute::None) + return addAccessAttr(A, R); + return false; + }; + // Check each function in turn, determining which pointer arguments are not // captured. for (Function *F : SCCNodes) { @@ -1210,7 +1255,7 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (A.getType()->isPointerTy() && !A.hasNoCaptureAttr()) { A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), CaptureInfo::none())); - ++NumNoCapture; + ++NumCapturesNone; Changed.insert(F); } } @@ -1221,21 +1266,23 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (!A.getType()->isPointerTy()) continue; bool HasNonLocalUses = false; - if (!A.hasNoCaptureAttr()) { + CaptureInfo OrigCI = A.getAttributes().getCaptureInfo(); + if (!capturesNothing(OrigCI)) { ArgumentUsesTracker Tracker(SCCNodes); PointerMayBeCaptured(&A, &Tracker); - if (!Tracker.Captured) { + CaptureInfo NewCI = Tracker.CI & OrigCI; + if (NewCI != OrigCI) { if (Tracker.Uses.empty()) { - // If it's trivially not captured, mark it nocapture now. 
- A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), - CaptureInfo::none())); - ++NumNoCapture; + // If the information is complete, add the attribute now. + A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), NewCI)); + addCapturesStat(NewCI); Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. ArgumentGraphNode *Node = AG[&A]; + Node->CC = CaptureComponents(NewCI); for (Argument *Use : Tracker.Uses) { Node->Uses.push_back(AG[Use]); if (Use != &A) @@ -1250,12 +1297,8 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // an SCC? Note that we don't allow any calls at all here, or else our // result will be dependent on the iteration order through the // functions in the SCC. - SmallPtrSet Self; - Self.insert(&A); - Attribute::AttrKind R = determinePointerAccessAttrs(&A, Self); - if (R != Attribute::None) - if (addAccessAttr(&A, R)) - Changed.insert(F); + if (DetermineAccessAttrsForSingleton(&A)) + Changed.insert(F); } if (!SkipInitializes && !A.onlyReadsMemory()) { if (inferInitializes(A, *F)) @@ -1281,17 +1324,17 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { Argument *A = ArgumentSCC[0]->Definition; - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), - CaptureInfo::none())); - ++NumNoCapture; - Changed.insert(A->getParent()); - - // Infer the access attributes given the new nocapture one - SmallPtrSet Self; - Self.insert(&*A); - Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); - if (R != Attribute::None) - addAccessAttr(A, R); + CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); + CaptureInfo NewCI = CaptureInfo(ArgumentSCC[0]->CC) & OrigCI; + if (NewCI != OrigCI) { + A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); + addCapturesStat(NewCI); + Changed.insert(A->getParent()); + } + + // Infer the access attributes given the new captures one + if (DetermineAccessAttrsForSingleton(A)) + Changed.insert(A->getParent()); } continue; } @@ -1303,27 +1346,45 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, ArgumentSCCNodes.insert(I->Definition); } - bool SCCCaptured = false; + // At the SCC level, only track merged CaptureComponents. We're not + // currently prepared to handle propagation of return-only captures across + // the SCC. 
+    CaptureComponents CC = CaptureComponents::None;
     for (ArgumentGraphNode *N : ArgumentSCC) {
       for (ArgumentGraphNode *Use : N->Uses) {
         Argument *A = Use->Definition;
-        if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
-          continue;
-        SCCCaptured = true;
+        if (ArgumentSCCNodes.count(A))
+          CC |= Use->CC;
+        else
+          CC |= CaptureComponents(A->getAttributes().getCaptureInfo());
         break;
       }
-      if (SCCCaptured)
+      if (capturesAll(CC))
         break;
     }
-    if (SCCCaptured)
-      continue;
 
-    for (ArgumentGraphNode *N : ArgumentSCC) {
-      Argument *A = N->Definition;
-      A->addAttr(
-          Attribute::getWithCaptureInfo(A->getContext(), CaptureInfo::none()));
-      ++NumNoCapture;
-      Changed.insert(A->getParent());
+    if (!capturesAll(CC)) {
+      for (ArgumentGraphNode *N : ArgumentSCC) {
+        Argument *A = N->Definition;
+        CaptureInfo OrigCI = A->getAttributes().getCaptureInfo();
+        CaptureInfo NewCI = CaptureInfo(N->CC | CC) & OrigCI;
+        if (NewCI != OrigCI) {
+          A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI));
+          addCapturesStat(NewCI);
+          Changed.insert(A->getParent());
+        }
+      }
+    }
+
+    // TODO(captures): Ignore address-only captures.
+    if (capturesAnything(CC)) {
+      // As the pointer may be captured, determine the pointer attributes
+      // looking at each argument individually.
+      for (ArgumentGraphNode *N : ArgumentSCC) {
+        if (DetermineAccessAttrsForSingleton(N->Definition))
+          Changed.insert(N->Definition->getParent());
+      }
+      continue;
     }
 
     // We also want to compute readonly/readnone/writeonly. With a small number
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index 2afcdf09af016..639e3039108a7 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -191,8 +191,10 @@ static bool runIPSCCP(
     if (ME == MemoryEffects::unknown())
       return AL;
 
-    ME |= MemoryEffects(IRMemLocation::Other,
-                        ME.getModRef(IRMemLocation::ArgMem));
+    ModRefInfo ArgMemMR = ME.getModRef(IRMemLocation::ArgMem);
+    ME |= MemoryEffects(IRMemLocation::ErrnoMem, ArgMemMR);
+    ME |= MemoryEffects(IRMemLocation::Other, ArgMemMR);
+
     return AL.addFnAttribute(
         F.getContext(),
         Attribute::getWithMemoryEffects(F.getContext(), ME));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 0feb6160b68fb..96d6db2ba5bfe 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -882,7 +882,8 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) {
 
     void tooManyUses() override { Captured = true; }
 
-    bool captured(const Use *U) override {
+    Action captured(const Use *U, UseCaptureInfo CI) override {
+      // TODO(captures): Use UseCaptureInfo.
       auto *ICmp = dyn_cast<ICmpInst>(U->getUser());
       // We need to check that U is based *only* on the alloca, and doesn't
       // have other contributions from a select/phi operand.
@@ -892,11 +893,11 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) {
       // Collect equality icmps of the alloca, and don't treat them as
       // captures.
ICmps[ICmp] |= 1u << U->getOperandNo(); - return false; + return Continue; } Captured = true; - return true; + return Stop; } }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index c8bdf029dd71c..c7023eb79b04e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -255,6 +255,33 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { } } + // mul (shr exact X, N), (2^N + 1) -> add (X, shr exact (X, N)) + { + Value *NewOp; + const APInt *ShiftC; + const APInt *MulAP; + if (BitWidth > 2 && + match(&I, m_Mul(m_Exact(m_Shr(m_Value(NewOp), m_APInt(ShiftC))), + m_APInt(MulAP))) && + (*MulAP - 1).isPowerOf2() && *ShiftC == MulAP->logBase2()) { + Value *BinOp = Op0; + BinaryOperator *OpBO = cast(Op0); + + // mul nuw (ashr exact X, N) -> add nuw (X, lshr exact (X, N)) + if (HasNUW && OpBO->getOpcode() == Instruction::AShr && OpBO->hasOneUse()) + BinOp = Builder.CreateLShr(NewOp, ConstantInt::get(Ty, *ShiftC), "", + /*isExact=*/true); + + auto *NewAdd = BinaryOperator::CreateAdd(NewOp, BinOp); + if (HasNSW && (HasNUW || OpBO->getOpcode() == Instruction::LShr || + ShiftC->getZExtValue() < BitWidth - 1)) + NewAdd->setHasNoSignedWrap(true); + + NewAdd->setHasNoUnsignedWrap(HasNUW); + return NewAdd; + } + } + if (Op0->hasOneUse() && match(Op1, m_NegatedPower2())) { // Interpret X * (-1< ClHotPercentileCutoff("hwasan-percentile-cutoff-hot", cl::desc("Hot percentile cutoff.")); static cl::opt - ClRandomSkipRate("hwasan-random-rate", + ClRandomKeepRate("hwasan-random-rate", cl::desc("Probability value in the range [0.0, 1.0] " - "to keep instrumentation of a function.")); + "to keep instrumentation of a function. " + "Note: instrumentation can be skipped randomly " + "OR because of the hot percentile cutoff, if " + "both are supplied.")); STATISTIC(NumTotalFuncs, "Number of total funcs"); STATISTIC(NumInstrumentedFuncs, "Number of instrumented funcs"); @@ -301,7 +304,7 @@ class HWAddressSanitizer { : M(M), SSI(SSI) { this->Recover = optOr(ClRecover, Recover); this->CompileKernel = optOr(ClEnableKhwasan, CompileKernel); - this->Rng = ClRandomSkipRate.getNumOccurrences() ? M.createRNG(DEBUG_TYPE) + this->Rng = ClRandomKeepRate.getNumOccurrences() ? M.createRNG(DEBUG_TYPE) : nullptr; initializeModule(); @@ -1599,9 +1602,9 @@ bool HWAddressSanitizer::selectiveInstrumentationShouldSkip( }; auto SkipRandom = [&]() { - if (!ClRandomSkipRate.getNumOccurrences()) + if (!ClRandomKeepRate.getNumOccurrences()) return false; - std::bernoulli_distribution D(ClRandomSkipRate); + std::bernoulli_distribution D(ClRandomKeepRate); return !D(*Rng); }; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 60f3893f20a79..8708489ac4fef 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2602,6 +2602,60 @@ struct MemorySanitizerVisitor : public InstVisitor { SC.Done(&I); } + /// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent + /// fields. + /// + /// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) + /// <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) + /// + /// TODO: adapt this function to handle horizontal add/sub? 
+  void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I) {
+    assert(I.arg_size() == 1 || I.arg_size() == 2);
+
+    assert(I.getType()->isVectorTy());
+    assert(I.getArgOperand(0)->getType()->isVectorTy());
+
+    FixedVectorType *ParamType =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType());
+    if (I.arg_size() == 2)
+      assert(ParamType == cast<FixedVectorType>(I.getArgOperand(1)->getType()));
+    [[maybe_unused]] FixedVectorType *ReturnType =
+        cast<FixedVectorType>(I.getType());
+    assert(ParamType->getNumElements() * I.arg_size() ==
+           2 * ReturnType->getNumElements());
+
+    IRBuilder<> IRB(&I);
+    unsigned Width = ParamType->getNumElements() * I.arg_size();
+
+    // Horizontal OR of shadow
+    SmallVector EvenMask;
+    SmallVector OddMask;
+    for (unsigned X = 0; X < Width; X += 2) {
+      EvenMask.push_back(X);
+      OddMask.push_back(X + 1);
+    }
+
+    Value *FirstArgShadow = getShadow(&I, 0);
+    Value *EvenShadow;
+    Value *OddShadow;
+    if (I.arg_size() == 2) {
+      Value *SecondArgShadow = getShadow(&I, 1);
+      EvenShadow =
+          IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, EvenMask);
+      OddShadow =
+          IRB.CreateShuffleVector(FirstArgShadow, SecondArgShadow, OddMask);
+    } else {
+      EvenShadow = IRB.CreateShuffleVector(FirstArgShadow, EvenMask);
+      OddShadow = IRB.CreateShuffleVector(FirstArgShadow, OddMask);
+    }
+
+    Value *OrShadow = IRB.CreateOr(EvenShadow, OddShadow);
+    OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
+
+    setShadow(&I, OrShadow);
+    setOriginForNaryOp(I);
+  }
+
   void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
 
   // Handle multiplication by constant.
@@ -3118,7 +3172,31 @@ struct MemorySanitizerVisitor : public InstVisitor {
     setOriginForNaryOp(I);
   }
 
-  // Instrument vector convert intrinsic.
+  /// Handle Arm NEON vector convert intrinsics.
+  ///
+  /// e.g., <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>)
+  ///       i32 @llvm.aarch64.neon.fcvtms.i32.f64(double)
+  ///
+  /// For x86 SSE vector convert intrinsics, see
+  /// handleSSEVectorConvertIntrinsic().
+  void handleNEONVectorConvertIntrinsic(IntrinsicInst &I) {
+    assert(I.arg_size() == 1);
+
+    IRBuilder<> IRB(&I);
+    Value *S0 = getShadow(&I, 0);
+
+    /// For scalars:
+    /// Since they are converting from floating-point to integer, the output is
+    /// - fully uninitialized if *any* bit of the input is uninitialized
+    /// - fully initialized if all bits of the input are initialized
+    /// We apply the same principle on a per-field basis for vectors.
+    Value *OutShadow = IRB.CreateSExt(IRB.CreateICmpNE(S0, getCleanShadow(S0)),
+                                      getShadowTy(&I));
+    setShadow(&I, OutShadow);
+    setOriginForNaryOp(I);
+  }
+
+  // Instrument x86 SSE vector convert intrinsic.
   //
   // This function instruments intrinsics like cvtsi2ss:
   // %Out = int_xxx_cvtyyy(%ConvertOp)
   //
@@ -3133,8 +3211,11 @@ struct MemorySanitizerVisitor : public InstVisitor {
   // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
   // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
   // return a fully initialized value.
-  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements,
-                                    bool HasRoundingMode = false) {
+  //
+  // For Arm NEON vector convert intrinsics, see
+  // handleNEONVectorConvertIntrinsic().
+ void handleSSEVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements, + bool HasRoundingMode = false) { IRBuilder<> IRB(&I); Value *CopyOp, *ConvertOp; @@ -4423,7 +4504,7 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_avx512_cvtusi2ss: case Intrinsic::x86_avx512_cvtusi642sd: case Intrinsic::x86_avx512_cvtusi642ss: - handleVectorConvertIntrinsic(I, 1, true); + handleSSEVectorConvertIntrinsic(I, 1, true); break; case Intrinsic::x86_sse2_cvtsd2si64: case Intrinsic::x86_sse2_cvtsd2si: @@ -4434,11 +4515,11 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse_cvttss2si: - handleVectorConvertIntrinsic(I, 1); + handleSSEVectorConvertIntrinsic(I, 1); break; case Intrinsic::x86_sse_cvtps2pi: case Intrinsic::x86_sse_cvttps2pi: - handleVectorConvertIntrinsic(I, 2); + handleSSEVectorConvertIntrinsic(I, 2); break; case Intrinsic::x86_avx512_psll_w_512: @@ -4781,6 +4862,48 @@ struct MemorySanitizerVisitor : public InstVisitor { setOrigin(&I, getCleanOrigin()); break; + // Add Pairwise + case Intrinsic::aarch64_neon_addp: + // Floating-point Add Pairwise + case Intrinsic::aarch64_neon_faddp: + // Add Long Pairwise + case Intrinsic::aarch64_neon_saddlp: + case Intrinsic::aarch64_neon_uaddlp: { + handlePairwiseShadowOrIntrinsic(I); + break; + } + + // Floating-point Convert to integer, rounding to nearest with ties to Away + case Intrinsic::aarch64_neon_fcvtas: + case Intrinsic::aarch64_neon_fcvtau: + // Floating-point convert to integer, rounding toward minus infinity + case Intrinsic::aarch64_neon_fcvtms: + case Intrinsic::aarch64_neon_fcvtmu: + // Floating-point convert to integer, rounding to nearest with ties to even + case Intrinsic::aarch64_neon_fcvtns: + case Intrinsic::aarch64_neon_fcvtnu: + // Floating-point convert to integer, rounding toward plus infinity + case Intrinsic::aarch64_neon_fcvtps: + case Intrinsic::aarch64_neon_fcvtpu: + // Floating-point Convert to integer, rounding toward Zero + case Intrinsic::aarch64_neon_fcvtzs: + case Intrinsic::aarch64_neon_fcvtzu: + // Floating-point convert to lower precision narrow, rounding to odd + case Intrinsic::aarch64_neon_fcvtxn: { + handleNEONVectorConvertIntrinsic(I); + break; + } + + // Saturating extract narrow + case Intrinsic::aarch64_neon_sqxtn: + case Intrinsic::aarch64_neon_sqxtun: + case Intrinsic::aarch64_neon_uqxtn: + // These only have one argument, but we (ab)use handleShadowOr because it + // does work on single argument intrinsics and will typecast the shadow + // (and update the origin). 
+ handleShadowOr(I); + break; + case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: @@ -4832,6 +4955,12 @@ struct MemorySanitizerVisitor : public InstVisitor { break; } + case Intrinsic::scmp: + case Intrinsic::ucmp: { + handleShadowOr(I); + break; + } + default: if (!handleUnknownIntrinsic(I)) visitInstruction(I); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 46b0783004fcd..eaf89b23c26f7 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1099,7 +1099,7 @@ struct LoopFuser { LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); for (Instruction *NotHoistedInst : NotHoisting) { - if (auto D = DI.depends(&I, NotHoistedInst, true)) { + if (auto D = DI.depends(&I, NotHoistedInst)) { // Dependency is not read-before-write, write-before-read or // write-before-write if (D->isFlow() || D->isAnti() || D->isOutput()) { @@ -1111,7 +1111,7 @@ struct LoopFuser { } for (Instruction *ReadInst : FC0.MemReads) { - if (auto D = DI.depends(ReadInst, &I, true)) { + if (auto D = DI.depends(ReadInst, &I)) { // Dependency is not read-before-write if (D->isAnti()) { LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n"); @@ -1121,7 +1121,7 @@ struct LoopFuser { } for (Instruction *WriteInst : FC0.MemWrites) { - if (auto D = DI.depends(WriteInst, &I, true)) { + if (auto D = DI.depends(WriteInst, &I)) { // Dependency is not write-before-read or write-before-write if (D->isFlow() || D->isOutput()) { LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n"); @@ -1153,7 +1153,7 @@ struct LoopFuser { return true; for (Instruction *ReadInst : FC1.MemReads) { - if (auto D = DI.depends(&I, ReadInst, true)) { + if (auto D = DI.depends(&I, ReadInst)) { // Dependency is not write-before-read if (D->isFlow()) { LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n"); @@ -1163,7 +1163,7 @@ struct LoopFuser { } for (Instruction *WriteInst : FC1.MemWrites) { - if (auto D = DI.depends(&I, WriteInst, true)) { + if (auto D = DI.depends(&I, WriteInst)) { // Dependency is not write-before-write or read-before-write if (D->isOutput() || D->isAnti()) { LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n"); @@ -1335,7 +1335,7 @@ struct LoopFuser { case FUSION_DEPENDENCE_ANALYSIS_SCEV: return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); case FUSION_DEPENDENCE_ANALYSIS_DA: { - auto DepResult = DI.depends(&I0, &I1, true); + auto DepResult = DI.depends(&I0, &I1); if (!DepResult) return true; #ifndef NDEBUG diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index d88fdf41db7a8..f45d90ff13e14 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -146,7 +146,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, if (isa(Src) && isa(Dst)) continue; // Track Output, Flow, and Anti dependencies. - if (auto D = DI->depends(Src, Dst, true)) { + if (auto D = DI->depends(Src, Dst)) { assert(D->isOrdered() && "Expected an output, flow or anti dep."); // If the direction vector is negative, normalize it to // make it non-negative. 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 87b27beb01a0a..9a729b7afb8b9 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1550,32 +1550,33 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, } if (!Visited.insert(&U).second) continue; - switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { - case UseCaptureKind::MAY_CAPTURE: - return false; - case UseCaptureKind::PASSTHROUGH: - // Instructions cannot have non-instruction users. + UseCaptureInfo CI = + DetermineUseCaptureKind(U, AI, IsDereferenceableOrNull); + // TODO(captures): Make this more precise. + if (CI.isPassthrough()) { Worklist.push_back(UI); continue; - case UseCaptureKind::NO_CAPTURE: { - if (UI->isLifetimeStartOrEnd()) { - // We note the locations of these intrinsic calls so that we can - // delete them later if the optimization succeeds, this is safe - // since both llvm.lifetime.start and llvm.lifetime.end intrinsics - // practically fill all the bytes of the alloca with an undefined - // value, although conceptually marked as alive/dead. - int64_t Size = cast(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; - } - } - if (UI->hasMetadata(LLVMContext::MD_noalias)) - NoAliasInstrs.insert(UI); - if (!ModRefCallback(UI)) - return false; } + + if (capturesAnything(CI)) + return false; + + if (UI->isLifetimeStartOrEnd()) { + // We note the locations of these intrinsic calls so that we can + // delete them later if the optimization succeeds, this is safe + // since both llvm.lifetime.start and llvm.lifetime.end intrinsics + // practically fill all the bytes of the alloca with an undefined + // value, although conceptually marked as alive/dead. + int64_t Size = cast(UI->getOperand(0))->getSExtValue(); + if (Size < 0 || Size == DestSize) { + LifetimeMarkers.push_back(UI); + continue; + } } + if (UI->hasMetadata(LLVMContext::MD_noalias)) + NoAliasInstrs.insert(UI); + if (!ModRefCallback(UI)) + return false; } } return true; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 29240aaaa21be..e88c130cccf20 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5154,7 +5154,7 @@ insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr, return; DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr, - Orig->getDebugLoc(), BeforeInst); + Orig->getDebugLoc(), BeforeInst->getIterator()); } /// Insert a new dbg.assign. diff --git a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp index b0105ae8fa116..ecf71b6056f2a 100644 --- a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp +++ b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp @@ -398,7 +398,7 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, // Check if I has any output/flow/anti dependences with instructions from \p // StartInst to \p EndInst. 
if (llvm::any_of(InstsToCheck, [&DI, &I](Instruction *CurInst) { - auto DepResult = DI->depends(&I, CurInst, true); + auto DepResult = DI->depends(&I, CurInst); if (DepResult && (DepResult->isOutput() || DepResult->isFlow() || DepResult->isAnti())) return true; diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index e5e2aa6556930..e47a6ce6e9205 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -127,7 +127,7 @@ bool llvm::applyDebugifyMetadata( // Helper that inserts a dbg.value before \p InsertBefore, copying the // location (and possibly the type, if it's non-void) from \p TemplateInst. auto insertDbgVal = [&](Instruction &TemplateInst, - Instruction *InsertBefore) { + BasicBlock::iterator InsertPt) { std::string Name = utostr(NextVar++); Value *V = &TemplateInst; if (TemplateInst.getType()->isVoidTy()) @@ -137,7 +137,7 @@ bool llvm::applyDebugifyMetadata( getCachedDIType(V->getType()), /*AlwaysPreserve=*/true); DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc, - InsertBefore); + InsertPt); }; for (BasicBlock &BB : F) { @@ -161,7 +161,9 @@ bool llvm::applyDebugifyMetadata( // are made. BasicBlock::iterator InsertPt = BB.getFirstInsertionPt(); assert(InsertPt != BB.end() && "Expected to find an insertion point"); - Instruction *InsertBefore = &*InsertPt; + + // Insert after existing debug values to preserve order. + InsertPt.setHeadBit(false); // Attach debug values. for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) { @@ -172,9 +174,9 @@ bool llvm::applyDebugifyMetadata( // Phis and EH pads must be grouped at the beginning of the block. // Only advance the insertion point when we finish visiting these. if (!isa(I) && !I->isEHPad()) - InsertBefore = I->getNextNode(); + InsertPt = std::next(I->getIterator()); - insertDbgVal(*I, InsertBefore); + insertDbgVal(*I, InsertPt); InsertedDbgVal = true; } } @@ -185,7 +187,7 @@ bool llvm::applyDebugifyMetadata( // those tests, and this helps with that.) if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) { auto *Term = findTerminatingInstruction(F.getEntryBlock()); - insertDbgVal(*Term, Term); + insertDbgVal(*Term, Term->getIterator()); } if (ApplyToMF) ApplyToMF(DIB, F); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2c6328300738f..6d7c710020c3e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1693,9 +1693,7 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV, const DebugLoc &NewLoc, BasicBlock::iterator Instr) { if (!UseNewDbgInfoFormat) { - auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, - (Instruction *)nullptr); - cast(DbgVal)->insertBefore(Instr); + Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, Instr); } else { // RemoveDIs: if we're using the new debug-info format, allocate a // DbgVariableRecord directly instead of a dbg.value intrinsic. 
@@ -1708,19 +1706,10 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV, static void insertDbgValueOrDbgVariableRecordAfter( DIBuilder &Builder, Value *DV, DILocalVariable *DIVar, DIExpression *DIExpr, - const DebugLoc &NewLoc, BasicBlock::iterator Instr) { - if (!UseNewDbgInfoFormat) { - auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, - (Instruction *)nullptr); - cast(DbgVal)->insertAfter(Instr); - } else { - // RemoveDIs: if we're using the new debug-info format, allocate a - // DbgVariableRecord directly instead of a dbg.value intrinsic. - ValueAsMetadata *DVAM = ValueAsMetadata::get(DV); - DbgVariableRecord *DV = - new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get()); - Instr->getParent()->insertDbgRecordAfter(DV, &*Instr); - } + const DebugLoc &NewLoc, Instruction *Instr) { + BasicBlock::iterator NextIt = std::next(Instr->getIterator()); + NextIt.setHeadBit(true); + insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, NextIt); } /// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value @@ -1812,7 +1801,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, // preferable to keep tracking both the loaded value and the original // address in case the alloca can not be elided. insertDbgValueOrDbgVariableRecordAfter(Builder, LI, DIVar, DIExpr, NewLoc, - LI->getIterator()); + LI); } void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index 0b532b68e3721..89cfee2acd82c 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -709,7 +709,7 @@ static bool checkDependency(Instruction *Src, Instruction *Dst, // (0,0,>=,*,*) // Now, the dependency is not necessarily non-negative anymore, i.e. // unroll-and-jam may violate correctness. - std::unique_ptr D = DI.depends(Src, Dst, true); + std::unique_ptr D = DI.depends(Src, Dst); if (!D) return true; assert(D->isOrdered() && "Expected an output, flow or anti dep."); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 016186cb6b09d..05fd989271c32 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -119,7 +119,8 @@ static void createDebugValue(DIBuilder &DIB, Value *NewValue, DILocalVariable *Variable, DIExpression *Expression, const DILocation *DI, Instruction *InsertBefore) { - DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, InsertBefore); + DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, + InsertBefore->getIterator()); } /// Helper for updating assignment tracking debug info when promoting allocas. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2241be60ce05..584cda34f902e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5260,6 +5260,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; + collectInLoopReductions(); + for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { Instruction *I = IdxToInstr[Idx]; @@ -5276,8 +5278,6 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { if (ValuesToIgnore.count(I)) continue; - collectInLoopReductions(); - // For each VF find the maximum usage of registers. for (unsigned J = 0, E = VFs.size(); J < E; ++J) { // Count the number of registers used, per register class, given all open @@ -7008,6 +7008,10 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { } void LoopVectorizationCostModel::collectInLoopReductions() { + // Avoid duplicating work finding in-loop reductions. + if (!InLoopReductions.empty()) + return; + for (const auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = Reduction.second; @@ -8800,6 +8804,10 @@ bool VPRecipeBuilder::getScaledReductions( return false; using namespace llvm::PatternMatch; + // Use the side-effect of match to replace BinOp only if the pattern is + // matched, we don't care at this point whether it actually matched. + match(BinOp, m_Neg(m_BinOp(BinOp))); + Value *A, *B; if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) @@ -8932,6 +8940,19 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, std::swap(BinOp, Accumulator); unsigned ReductionOpcode = Reduction->getOpcode(); + if (ReductionOpcode == Instruction::Sub) { + VPBasicBlock *ParentBlock = Builder.getInsertBlock(); + assert(ParentBlock && "Builder must have an insert block."); + + auto *const Zero = ConstantInt::get(Reduction->getType(), 0); + SmallVector Ops; + Ops.push_back(Plan.getOrAddLiveIn(Zero)); + Ops.push_back(BinOp); + BinOp = new VPWidenRecipe(*Reduction, make_range(Ops.begin(), Ops.end())); + ParentBlock->appendRecipe(BinOp->getDefiningRecipe()); + ReductionOpcode = Instruction::Add; + } + if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) { assert((ReductionOpcode == Instruction::Add || ReductionOpcode == Instruction::Sub) && diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e1c08077126db..c72d3579e1aa3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3997,23 +3997,29 @@ class BoUpSLP { return Sum; } - void dump(raw_ostream &os) const { - if (!isSchedulingEntity()) { - os << "/ " << *Inst; - } else if (NextInBundle) { - os << '[' << *Inst; + void dump(raw_ostream &OS) const { + if (isPartOfBundle()) { + if (!isSchedulingEntity()) { + OS << "/ " << *Inst << ", part of "; + FirstInBundle->dump(OS); + return; + } + OS << '[' << *Inst; ScheduleData *SD = NextInBundle; while (SD) { - os << ';' << *SD->Inst; + OS << ';' << *SD->Inst; SD = SD->NextInBundle; } - os << ']'; + OS << ']'; } else { - os << *Inst; + OS << *Inst; } } - LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } Instruction *Inst = nullptr; @@ -5596,6 
+5602,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1) return std::nullopt; + if (TE.ReuseShuffleIndices.size() % Sz != 0) + return std::nullopt; if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, Sz)) { SmallVector ReorderMask(Sz, PoisonMaskElem); @@ -5626,7 +5634,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { UsedVals.set(Val); for (unsigned K = 0; K < NumParts; ++K) { unsigned Idx = Val + Sz * K; - if (Idx < VF) + if (Idx < VF && I + K < VF) ResOrder[Idx] = I + K; } } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 06a5e3bed7f03..098b296c30ab8 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -122,6 +122,8 @@ MemDGNodeIntervalBuilder::getBotMemDGNode(const Interval &Intvl, Interval MemDGNodeIntervalBuilder::make(const Interval &Instrs, DependencyGraph &DAG) { + if (Instrs.empty()) + return {}; auto *TopMemN = getTopMemDGNode(Instrs, DAG); // If we couldn't find a mem node in range TopN - BotN then it's empty. if (TopMemN == nullptr) @@ -529,8 +531,8 @@ Interval DependencyGraph::extend(ArrayRef Instrs) { } } }; - if (DAGInterval.empty()) { - assert(NewInterval == InstrsInterval && "Expected empty DAGInterval!"); + auto MemDAGInterval = MemDGNodeIntervalBuilder::make(DAGInterval, *this); + if (MemDAGInterval.empty()) { FullScan(NewInterval); } // 2. The new section is below the old section. @@ -550,8 +552,7 @@ Interval DependencyGraph::extend(ArrayRef Instrs) { // range including both NewInterval and DAGInterval until DstN, for each DstN. else if (DAGInterval.bottom()->comesBefore(NewInterval.top())) { auto DstRange = MemDGNodeIntervalBuilder::make(NewInterval, *this); - auto SrcRangeFull = MemDGNodeIntervalBuilder::make( - DAGInterval.getUnionInterval(NewInterval), *this); + auto SrcRangeFull = MemDAGInterval.getUnionInterval(DstRange); for (MemDGNode &DstN : DstRange) { auto SrcRange = Interval(SrcRangeFull.top(), DstN.getPrevNode()); @@ -589,7 +590,7 @@ Interval DependencyGraph::extend(ArrayRef Instrs) { // When scanning for deps with destination in DAGInterval we need to // consider sources from the NewInterval only, because all intra-DAGInterval // dependencies have already been created. - auto DstRangeOld = MemDGNodeIntervalBuilder::make(DAGInterval, *this); + auto DstRangeOld = MemDAGInterval; auto SrcRange = MemDGNodeIntervalBuilder::make(NewInterval, *this); for (MemDGNode &DstN : DstRangeOld) scanAndAddDeps(DstN, SrcRange); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index c9329c24e1f4c..74634372156aa 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -202,11 +202,14 @@ CollectDescr LegalityAnalysis::getHowToCollectValues(ArrayRef Bndl) const { SmallVector Vec; Vec.reserve(Bndl.size()); - for (auto [Lane, V] : enumerate(Bndl)) { + for (auto [Elm, V] : enumerate(Bndl)) { if (auto *VecOp = IMaps.getVectorForOrig(V)) { // If there is a vector containing `V`, then get the lane it came from. std::optional ExtractIdxOpt = IMaps.getOrigLane(VecOp, V); - Vec.emplace_back(VecOp, ExtractIdxOpt ? 
*ExtractIdxOpt : -1); + // This could be a vector, like <2 x float> in which case the mask needs + // to enumerate all lanes. + for (unsigned Ln = 0, Lanes = VecUtils::getNumLanes(V); Ln != Lanes; ++Ln) + Vec.emplace_back(VecOp, ExtractIdxOpt ? *ExtractIdxOpt + Ln : -1); } else { Vec.emplace_back(V); } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 507d163240127..0ccef5aecd28b 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -328,12 +328,14 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, const ShuffleMask &Mask = cast(LegalityRes).getMask(); NewVec = createShuffle(VecOp, Mask, UserBB); + assert(NewVec->getType() == VecOp->getType() && + "Expected same type! Bad mask ?"); break; } case LegalityResultID::DiamondReuseMultiInput: { const auto &Descr = cast(LegalityRes).getCollectDescr(); - Type *ResTy = FixedVectorType::get(Bndl[0]->getType(), Bndl.size()); + Type *ResTy = VecUtils::getWideType(Bndl[0]->getType(), Bndl.size()); // TODO: Try to get WhereIt without creating a vector. SmallVector DescrInstrs; @@ -345,7 +347,8 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, getInsertPointAfterInstrs(DescrInstrs, UserBB); Value *LastV = PoisonValue::get(ResTy); - for (auto [Lane, ElmDescr] : enumerate(Descr.getDescrs())) { + unsigned Lane = 0; + for (const auto &ElmDescr : Descr.getDescrs()) { Value *VecOp = ElmDescr.getValue(); Context &Ctx = VecOp->getContext(); Value *ValueToInsert; @@ -357,10 +360,32 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, } else { ValueToInsert = VecOp; } - ConstantInt *LaneC = ConstantInt::get(Type::getInt32Ty(Ctx), Lane); - Value *Ins = InsertElementInst::create(LastV, ValueToInsert, LaneC, - WhereIt, Ctx, "VIns"); - LastV = Ins; + auto NumLanesToInsert = VecUtils::getNumLanes(ValueToInsert); + if (NumLanesToInsert == 1) { + // If we are inserting a scalar element then we need a single insert. 
+        // %VIns = insert %DstVec, %SrcScalar, Lane
+        ConstantInt *LaneC = ConstantInt::get(Type::getInt32Ty(Ctx), Lane);
+        LastV = InsertElementInst::create(LastV, ValueToInsert, LaneC, WhereIt,
+                                          Ctx, "VIns");
+      } else {
+        // If we are inserting a vector element then we need to extract and
+        // insert each vector element one by one with a chain of extracts and
+        // inserts, for example:
+        //  %VExt0 = extract %SrcVec, 0
+        //  %VIns0 = insert %DstVec, %VExt0, Lane + 0
+        //  %VExt1 = extract %SrcVec, 1
+        //  %VIns1 = insert %VIns0, %VExt1, Lane + 1
+        for (unsigned LnCnt = 0; LnCnt != NumLanesToInsert; ++LnCnt) {
+          auto *ExtrIdxC = ConstantInt::get(Type::getInt32Ty(Ctx), LnCnt);
+          auto *ExtrI = ExtractElementInst::create(ValueToInsert, ExtrIdxC,
+                                                   WhereIt, Ctx, "VExt");
+          unsigned InsLane = Lane + LnCnt;
+          auto *InsLaneC = ConstantInt::get(Type::getInt32Ty(Ctx), InsLane);
+          LastV = InsertElementInst::create(LastV, ExtrI, InsLaneC, WhereIt,
+                                            Ctx, "VIns");
+        }
+      }
+      Lane += NumLanesToInsert;
     }
     NewVec = LastV;
     break;
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp
index dd24cc3d98cf8..2f7d7087ca880 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp
@@ -230,11 +230,13 @@ bool Scheduler::trySchedule(ArrayRef Instrs) {
     // top-most part of the schedule that includes the instrs in the bundle and
     // re-schedule.
     trimSchedule(Instrs);
+    ScheduleTopItOpt = std::nullopt;
     [[fallthrough]];
   case BndlSchedState::NoneScheduled: {
     // TODO: Set the window of the DAG that we are interested in.
-    // We start scheduling at the bottom instr of Instrs.
-    ScheduleTopItOpt = std::next(VecUtils::getLowest(Instrs)->getIterator());
+    if (!ScheduleTopItOpt)
+      // We start scheduling at the bottom instr of Instrs.
+      ScheduleTopItOpt = std::next(VecUtils::getLowest(Instrs)->getIterator());
 
     // TODO: For now don't cross BBs.
     if (!DAG.getInterval().empty()) {
@@ -262,6 +264,12 @@ bool Scheduler::trySchedule(ArrayRef Instrs) {
 void Scheduler::dump(raw_ostream &OS) const {
   OS << "ReadyList:\n";
   ReadyList.dump(OS);
+  OS << "Top of schedule: ";
+  if (ScheduleTopItOpt)
+    OS << **ScheduleTopItOpt;
+  else
+    OS << "Empty";
+  OS << "\n";
 }
 void Scheduler::dump() const { dump(dbgs()); }
 #endif // NDEBUG
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3816e1b61576a..fbbc466f2f7f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3709,18 +3709,16 @@ class VPlan {
   /// yet) for \p V.
   VPValue *getOrAddLiveIn(Value *V) {
     assert(V && "Trying to get or add the VPValue of a null Value");
-    if (!Value2VPValue.count(V)) {
+    auto [It, Inserted] = Value2VPValue.try_emplace(V);
+    if (Inserted) {
       VPValue *VPV = new VPValue(V);
       VPLiveInsToFree.push_back(VPV);
       assert(VPV->isLiveIn() && "VPV must be a live-in.");
-      assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
-      Value2VPValue[V] = VPV;
+      It->second = VPV;
     }
-    assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
-    assert(Value2VPValue[V]->isLiveIn() &&
-           "Only live-ins should be in mapping");
-    return Value2VPValue[V];
+    assert(It->second->isLiveIn() && "Only live-ins should be in mapping");
+    return It->second;
   }
 
   /// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1855fb67aa54f..f5d5e12b1c85d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/VectorBuilder.h" @@ -284,13 +285,18 @@ InstructionCost VPPartialReductionRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { std::optional Opcode = std::nullopt; - VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); + VPValue *BinOp = getOperand(0); // If the partial reduction is predicated, a select will be operand 0 rather // than the binary op using namespace llvm::VPlanPatternMatch; if (match(getOperand(0), m_Select(m_VPValue(), m_VPValue(), m_VPValue()))) - BinOpR = BinOpR->getOperand(1)->getDefiningRecipe(); + BinOp = BinOp->getDefiningRecipe()->getOperand(1); + + // If BinOp is a negation, use the side effect of match to assign the actual + // binary operation to BinOp + match(BinOp, m_Binary(m_SpecificInt(0), m_VPValue(BinOp))); + VPRecipeBase *BinOpR = BinOp->getDefiningRecipe(); if (auto *WidenR = dyn_cast(BinOpR)) Opcode = std::make_optional(WidenR->getOpcode()); diff --git a/llvm/test/Analysis/BasicAA/fallback-mayalias.ll b/llvm/test/Analysis/BasicAA/fallback-mayalias.ll index 861351871f818..52eb494c8d75e 100644 --- a/llvm/test/Analysis/BasicAA/fallback-mayalias.ll +++ b/llvm/test/Analysis/BasicAA/fallback-mayalias.ll @@ -3,7 +3,7 @@ ; Check that BasicAA falls back to MayAlias (instead of PartialAlias) when none ; of its little tricks are applicable. 
-; CHECK: MayAlias: float* %arrayidxA, float* %arrayidxB +; CHECK: NoAlias: float* %arrayidxA, float* %arrayidxB define void @fallback_mayalias(ptr noalias nocapture %C, i64 %i, i64 %j) local_unnamed_addr { entry: diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll index 7a055c7152ec8..a0c06083c270b 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -23,11 +23,12 @@ entry: attributes #0 = { "omp_target_num_teams"="100" "omp_target_thread_limit"="101" + "nvvm.maxclusterrank"="200" } !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8, !9, !10} +!nvvm.annotations = !{!7, !8, !9, !10} !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) @@ -35,7 +36,6 @@ attributes #0 = { !3 = !{} !4 = !DISubroutineType(types: !3) !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) -!6 = !{ptr @test, !"maxclusterrank", i32 200} !7 = !{ptr @test, !"maxntidx", i32 210} !8 = !{ptr @test, !"maxntidy", i32 211} !9 = !{ptr @test, !"maxntidz", i32 212} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll index cb50b2c75ccb7..20fc0bea43b71 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll @@ -1,10 +1,11 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes='print' -disable-output < %s 2>&1 | FileCheck %s ; This is the test case from PR26314. -; When we were retrying dependence checking with memchecks only, -; the loop-invariant access in the inner loop was incorrectly determined to be wrapping +; When we were retrying dependence checking with memchecks only, +; the loop-invariant access in the inner loop was incorrectly determined to be wrapping ; because it was not strided in the inner loop. 
- + ; #define Z 32 ; typedef struct s { ; int v1[Z]; @@ -21,19 +22,52 @@ ; } ; } -; CHECK: function 'Test': -; CHECK: .inner: -; CHECK-NEXT: Memory dependences are safe with run-time checks -; CHECK-NEXT: Dependences: -; CHECK-NEXT: Run-time memory checks: -; CHECK: Check 0: -; CHECK: Check 1: - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } define void @Test(ptr nocapture %obj, i64 %z) #0 { +; CHECK-LABEL: 'Test' +; CHECK-NEXT: .inner: +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): +; CHECK-NEXT: %6 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 2, i64 %i, i64 %j +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): +; CHECK-NEXT: %2 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 0, i64 %j +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP1]]): +; CHECK-NEXT: %6 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 2, i64 %i, i64 %j +; CHECK-NEXT: Against group ([[GRP3:0x[0-9a-f]+]]): +; CHECK-NEXT: %1 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 1, i64 %i +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: {(256 + %obj),+,128}<%.outer.preheader> High: {(256 + (4 * %z) + %obj),+,128}<%.outer.preheader>) +; CHECK-NEXT: Member: {{\{\{}}(256 + %obj),+,128}<%.outer.preheader>,+,4}<%.inner> +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %obj High: ((4 * %z) + %obj)) +; CHECK-NEXT: Member: {%obj,+,4}<%.inner> +; CHECK-NEXT: Group [[GRP3]]: +; CHECK-NEXT: (Low: {(128 + %obj),+,4}<%.outer.preheader> High: {(132 + %obj),+,4}<%.outer.preheader>) +; CHECK-NEXT: Member: {(128 + %obj),+,4}<%.outer.preheader> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: .outer.preheader: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; br label %.outer.preheader @@ -44,7 +78,7 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { .exit: ret void - + .outer: %i.next = add nuw nsw i64 %i, 1 %exitcond.outer = icmp eq i64 %i.next, 32 @@ -59,7 +93,7 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { %6 = getelementptr inbounds %struct.s, ptr %obj, i64 0, i32 2, i64 %i, i64 %j %7 = load i32, ptr %6 %8 = add nsw i32 %5, %7 - store i32 %8, ptr %6 + store i32 %8, ptr %6 %j.next = add nuw nsw i64 %j, 1 %exitcond.inner = icmp eq i64 %j.next, %z br i1 %exitcond.inner, label %.outer, label %.inner diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll index 0058135a30d67..f939680aa279d 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-distance-backward.ll @@ -1,7 +1,12 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck --check-prefixes=COMMON,MAXLEN %s -; RUN: opt -passes='print' -disable-output -mtriple=arm64-apple-macosx %s 2>&1 | FileCheck --check-prefixes=COMMON,VW128 %s -; RUN: opt -passes='print' -disable-output -mtriple=arm64-apple-macosx -mattr=+sve %s 2>&1 | FileCheck --check-prefixes=COMMON,MAXLEN %s +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -disable-output %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=COMMON,CHECK +; RUN: opt -passes='print' -disable-output \ +; RUN: -mtriple=arm64-apple-macosx %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=COMMON,VW128 +; RUN: opt -passes='print' -disable-output \ +; RUN: -mtriple=arm64-apple-macosx -mattr=+sve %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=COMMON,MAXLEN ; REQUIRES: aarch64-registered-target @@ -72,38 +77,97 @@ exit: } define void @backward_min_distance_120(ptr %A, i64 %N) { -; COMMON-LABEL: 'backward_min_distance_120' -; COMMON-NEXT: loop: -; COMMON-NEXT: Memory dependences are safe with run-time checks -; COMMON-NEXT: Dependences: -; COMMON-NEXT: Run-time memory checks: -; COMMON-NEXT: Check 0: -; COMMON-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]): -; COMMON-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv -; COMMON-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]): -; COMMON-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv -; COMMON-NEXT: Grouped accesses: -; COMMON-NEXT: Group [[GRP3]]: -; COMMON-NEXT: (Low: {(15 + %A),+,1}<%outer.header> High: {(271 + %A),+,1}<%outer.header>) -; COMMON-NEXT: Member: {{\{\{}}(15 + %A),+,1}<%outer.header>,+,1}<%loop> -; COMMON-NEXT: Group [[GRP4]]: -; COMMON-NEXT: (Low: %A High: (256 + %A)) -; COMMON-NEXT: Member: {%A,+,1}<%loop> -; COMMON-EMPTY: -; COMMON-NEXT: Non vectorizable stores to invariant address were not found in loop. -; COMMON-NEXT: SCEV assumptions: -; COMMON-EMPTY: -; COMMON-NEXT: Expressions re-written: -; COMMON-NEXT: outer.header: -; COMMON-NEXT: Report: loop is not the innermost loop -; COMMON-NEXT: Dependences: -; COMMON-NEXT: Run-time memory checks: -; COMMON-NEXT: Grouped accesses: -; COMMON-EMPTY: -; COMMON-NEXT: Non vectorizable stores to invariant address were not found in loop. 
-; COMMON-NEXT: SCEV assumptions: -; COMMON-EMPTY: -; COMMON-NEXT: Expressions re-written: +; CHECK-LABEL: 'backward_min_distance_120' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 120 bits +; CHECK-NEXT: Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: %l = load i8, ptr %gep, align 4 -> +; CHECK-NEXT: store i8 %add, ptr %gep.off.iv, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: outer.header: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +; VW128-LABEL: 'backward_min_distance_120' +; VW128-NEXT: loop: +; VW128-NEXT: Memory dependences are safe with run-time checks +; VW128-NEXT: Dependences: +; VW128-NEXT: Run-time memory checks: +; VW128-NEXT: Check 0: +; VW128-NEXT: Comparing group ([[GRP7:0x[0-9a-f]+]]): +; VW128-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv +; VW128-NEXT: Against group ([[GRP8:0x[0-9a-f]+]]): +; VW128-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv +; VW128-NEXT: Grouped accesses: +; VW128-NEXT: Group [[GRP7]]: +; VW128-NEXT: (Low: {(15 + %A),+,1}<%outer.header> High: {(271 + %A),+,1}<%outer.header>) +; VW128-NEXT: Member: {{\{\{}}(15 + %A),+,1}<%outer.header>,+,1}<%loop> +; VW128-NEXT: Group [[GRP8]]: +; VW128-NEXT: (Low: %A High: (256 + %A)) +; VW128-NEXT: Member: {%A,+,1}<%loop> +; VW128-EMPTY: +; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. +; VW128-NEXT: SCEV assumptions: +; VW128-EMPTY: +; VW128-NEXT: Expressions re-written: +; VW128-NEXT: outer.header: +; VW128-NEXT: Report: loop is not the innermost loop +; VW128-NEXT: Dependences: +; VW128-NEXT: Run-time memory checks: +; VW128-NEXT: Grouped accesses: +; VW128-EMPTY: +; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. +; VW128-NEXT: SCEV assumptions: +; VW128-EMPTY: +; VW128-NEXT: Expressions re-written: +; +; MAXLEN-LABEL: 'backward_min_distance_120' +; MAXLEN-NEXT: loop: +; MAXLEN-NEXT: Memory dependences are safe with run-time checks +; MAXLEN-NEXT: Dependences: +; MAXLEN-NEXT: Run-time memory checks: +; MAXLEN-NEXT: Check 0: +; MAXLEN-NEXT: Comparing group ([[GRP9:0x[0-9a-f]+]]): +; MAXLEN-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv +; MAXLEN-NEXT: Against group ([[GRP10:0x[0-9a-f]+]]): +; MAXLEN-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv +; MAXLEN-NEXT: Grouped accesses: +; MAXLEN-NEXT: Group [[GRP9]]: +; MAXLEN-NEXT: (Low: {(15 + %A),+,1}<%outer.header> High: {(271 + %A),+,1}<%outer.header>) +; MAXLEN-NEXT: Member: {{\{\{}}(15 + %A),+,1}<%outer.header>,+,1}<%loop> +; MAXLEN-NEXT: Group [[GRP10]]: +; MAXLEN-NEXT: (Low: %A High: (256 + %A)) +; MAXLEN-NEXT: Member: {%A,+,1}<%loop> +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; MAXLEN-NEXT: SCEV assumptions: +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Expressions re-written: +; MAXLEN-NEXT: outer.header: +; MAXLEN-NEXT: Report: loop is not the innermost loop +; MAXLEN-NEXT: Dependences: +; MAXLEN-NEXT: Run-time memory checks: +; MAXLEN-NEXT: Grouped accesses: +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Non vectorizable stores to invariant address were not found in loop. +; MAXLEN-NEXT: SCEV assumptions: +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Expressions re-written: ; entry: br label %outer.header @@ -134,38 +198,97 @@ exit: } define void @backward_min_distance_128(ptr %A, i64 %N) { -; COMMON-LABEL: 'backward_min_distance_128' -; COMMON-NEXT: loop: -; COMMON-NEXT: Memory dependences are safe with run-time checks -; COMMON-NEXT: Dependences: -; COMMON-NEXT: Run-time memory checks: -; COMMON-NEXT: Check 0: -; COMMON-NEXT: Comparing group ([[GRP13:0x[0-9a-f]+]]): -; COMMON-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv -; COMMON-NEXT: Against group ([[GRP14:0x[0-9a-f]+]]): -; COMMON-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv -; COMMON-NEXT: Grouped accesses: -; COMMON-NEXT: Group [[GRP13]]: -; COMMON-NEXT: (Low: {(16 + %A),+,1}<%outer.header> High: {(272 + %A),+,1}<%outer.header>) -; COMMON-NEXT: Member: {{\{\{}}(16 + %A),+,1}<%outer.header>,+,1}<%loop> -; COMMON-NEXT: Group [[GRP14]]: -; COMMON-NEXT: (Low: %A High: (256 + %A)) -; COMMON-NEXT: Member: {%A,+,1}<%loop> -; COMMON-EMPTY: -; COMMON-NEXT: Non vectorizable stores to invariant address were not found in loop. -; COMMON-NEXT: SCEV assumptions: -; COMMON-EMPTY: -; COMMON-NEXT: Expressions re-written: -; COMMON-NEXT: outer.header: -; COMMON-NEXT: Report: loop is not the innermost loop -; COMMON-NEXT: Dependences: -; COMMON-NEXT: Run-time memory checks: -; COMMON-NEXT: Grouped accesses: -; COMMON-EMPTY: -; COMMON-NEXT: Non vectorizable stores to invariant address were not found in loop. -; COMMON-NEXT: SCEV assumptions: -; COMMON-EMPTY: -; COMMON-NEXT: Expressions re-written: +; CHECK-LABEL: 'backward_min_distance_128' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 128 bits +; CHECK-NEXT: Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: %l = load i8, ptr %gep, align 4 -> +; CHECK-NEXT: store i8 %add, ptr %gep.off.iv, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: outer.header: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +; VW128-LABEL: 'backward_min_distance_128' +; VW128-NEXT: loop: +; VW128-NEXT: Memory dependences are safe with run-time checks +; VW128-NEXT: Dependences: +; VW128-NEXT: Run-time memory checks: +; VW128-NEXT: Check 0: +; VW128-NEXT: Comparing group ([[GRP11:0x[0-9a-f]+]]): +; VW128-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv +; VW128-NEXT: Against group ([[GRP12:0x[0-9a-f]+]]): +; VW128-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv +; VW128-NEXT: Grouped accesses: +; VW128-NEXT: Group [[GRP11]]: +; VW128-NEXT: (Low: {(16 + %A),+,1}<%outer.header> High: {(272 + %A),+,1}<%outer.header>) +; VW128-NEXT: Member: {{\{\{}}(16 + %A),+,1}<%outer.header>,+,1}<%loop> +; VW128-NEXT: Group [[GRP12]]: +; VW128-NEXT: (Low: %A High: (256 + %A)) +; VW128-NEXT: Member: {%A,+,1}<%loop> +; VW128-EMPTY: +; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. +; VW128-NEXT: SCEV assumptions: +; VW128-EMPTY: +; VW128-NEXT: Expressions re-written: +; VW128-NEXT: outer.header: +; VW128-NEXT: Report: loop is not the innermost loop +; VW128-NEXT: Dependences: +; VW128-NEXT: Run-time memory checks: +; VW128-NEXT: Grouped accesses: +; VW128-EMPTY: +; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. +; VW128-NEXT: SCEV assumptions: +; VW128-EMPTY: +; VW128-NEXT: Expressions re-written: +; +; MAXLEN-LABEL: 'backward_min_distance_128' +; MAXLEN-NEXT: loop: +; MAXLEN-NEXT: Memory dependences are safe with run-time checks +; MAXLEN-NEXT: Dependences: +; MAXLEN-NEXT: Run-time memory checks: +; MAXLEN-NEXT: Check 0: +; MAXLEN-NEXT: Comparing group ([[GRP13:0x[0-9a-f]+]]): +; MAXLEN-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv +; MAXLEN-NEXT: Against group ([[GRP14:0x[0-9a-f]+]]): +; MAXLEN-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv +; MAXLEN-NEXT: Grouped accesses: +; MAXLEN-NEXT: Group [[GRP13]]: +; MAXLEN-NEXT: (Low: {(16 + %A),+,1}<%outer.header> High: {(272 + %A),+,1}<%outer.header>) +; MAXLEN-NEXT: Member: {{\{\{}}(16 + %A),+,1}<%outer.header>,+,1}<%loop> +; MAXLEN-NEXT: Group [[GRP14]]: +; MAXLEN-NEXT: (Low: %A High: (256 + %A)) +; MAXLEN-NEXT: Member: {%A,+,1}<%loop> +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Non vectorizable stores to invariant address were not found in loop. +; MAXLEN-NEXT: SCEV assumptions: +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Expressions re-written: +; MAXLEN-NEXT: outer.header: +; MAXLEN-NEXT: Report: loop is not the innermost loop +; MAXLEN-NEXT: Dependences: +; MAXLEN-NEXT: Run-time memory checks: +; MAXLEN-NEXT: Grouped accesses: +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Non vectorizable stores to invariant address were not found in loop. +; MAXLEN-NEXT: SCEV assumptions: +; MAXLEN-EMPTY: +; MAXLEN-NEXT: Expressions re-written: ; entry: br label %outer.header @@ -196,21 +319,73 @@ exit: } define void @backward_min_distance_256(ptr %A, i64 %N) { +; CHECK-LABEL: 'backward_min_distance_256' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 256 bits +; CHECK-NEXT: Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: %l = load i8, ptr %gep, align 4 -> +; CHECK-NEXT: store i8 %add, ptr %gep.off.iv, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: outer.header: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +; VW128-LABEL: 'backward_min_distance_256' +; VW128-NEXT: loop: +; VW128-NEXT: Memory dependences are safe with a maximum safe vector width of 256 bits +; VW128-NEXT: Dependences: +; VW128-NEXT: BackwardVectorizable: +; VW128-NEXT: %l = load i8, ptr %gep, align 4 -> +; VW128-NEXT: store i8 %add, ptr %gep.off.iv, align 4 +; VW128-EMPTY: +; VW128-NEXT: Run-time memory checks: +; VW128-NEXT: Grouped accesses: +; VW128-EMPTY: +; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. +; VW128-NEXT: SCEV assumptions: +; VW128-EMPTY: +; VW128-NEXT: Expressions re-written: +; VW128-NEXT: outer.header: +; VW128-NEXT: Report: loop is not the innermost loop +; VW128-NEXT: Dependences: +; VW128-NEXT: Run-time memory checks: +; VW128-NEXT: Grouped accesses: +; VW128-EMPTY: +; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. +; VW128-NEXT: SCEV assumptions: +; VW128-EMPTY: +; VW128-NEXT: Expressions re-written: +; ; MAXLEN-LABEL: 'backward_min_distance_256' ; MAXLEN-NEXT: loop: ; MAXLEN-NEXT: Memory dependences are safe with run-time checks ; MAXLEN-NEXT: Dependences: ; MAXLEN-NEXT: Run-time memory checks: ; MAXLEN-NEXT: Check 0: -; MAXLEN-NEXT: Comparing group ([[GRP17:0x[0-9a-f]+]]): +; MAXLEN-NEXT: Comparing group ([[GRP15:0x[0-9a-f]+]]): ; MAXLEN-NEXT: %gep.off.iv = getelementptr inbounds i8, ptr %gep.off, i64 %iv -; MAXLEN-NEXT: Against group ([[GRP18:0x[0-9a-f]+]]): +; MAXLEN-NEXT: Against group ([[GRP16:0x[0-9a-f]+]]): ; MAXLEN-NEXT: %gep = getelementptr inbounds i8, ptr %A, i64 %iv ; MAXLEN-NEXT: Grouped accesses: -; MAXLEN-NEXT: Group [[GRP17]]: +; MAXLEN-NEXT: Group [[GRP15]]: ; MAXLEN-NEXT: (Low: {(32 + %A),+,1}<%outer.header> High: {(288 + %A),+,1}<%outer.header>) ; MAXLEN-NEXT: Member: {{\{\{}}(32 + %A),+,1}<%outer.header>,+,1}<%loop> -; MAXLEN-NEXT: Group [[GRP18]]: +; MAXLEN-NEXT: Group [[GRP16]]: ; MAXLEN-NEXT: (Low: %A High: (256 + %A)) ; MAXLEN-NEXT: Member: {%A,+,1}<%loop> ; MAXLEN-EMPTY: @@ -229,32 +404,6 @@ define void @backward_min_distance_256(ptr %A, i64 %N) { ; MAXLEN-EMPTY: ; MAXLEN-NEXT: Expressions re-written: ; -; VW128-LABEL: 'backward_min_distance_256' -; VW128-NEXT: loop: -; VW128-NEXT: Memory dependences are safe with a maximum safe vector width of 256 bits -; VW128-NEXT: Dependences: -; VW128-NEXT: BackwardVectorizable: -; VW128-NEXT: %l = load i8, ptr %gep, align 4 -> -; VW128-NEXT: store i8 %add, ptr %gep.off.iv, align 4 -; VW128-EMPTY: -; VW128-NEXT: Run-time memory checks: -; VW128-NEXT: Grouped accesses: -; VW128-EMPTY: -; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. -; VW128-NEXT: SCEV assumptions: -; VW128-EMPTY: -; VW128-NEXT: Expressions re-written: -; VW128-NEXT: outer.header: -; VW128-NEXT: Report: loop is not the innermost loop -; VW128-NEXT: Dependences: -; VW128-NEXT: Run-time memory checks: -; VW128-NEXT: Grouped accesses: -; VW128-EMPTY: -; VW128-NEXT: Non vectorizable stores to invariant address were not found in loop. 
-; VW128-NEXT: SCEV assumptions: -; VW128-EMPTY: -; VW128-NEXT: Expressions re-written: -; entry: br label %outer.header diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll new file mode 100644 index 0000000000000..e42392df3e93e --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s + +define void @dependency_check_and_runtime_checks_needed_gepb_is_inbounds_iv2_step5(ptr %a, ptr %b, i64 %offset, i64 %n) { +; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_gepb_is_inbounds_iv2_step5' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.b = getelementptr inbounds float, ptr %b, i64 %iv2 +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP1]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP3:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP2]]): +; CHECK-NEXT: %gep.b = getelementptr inbounds float, ptr %b, i64 %iv2 +; CHECK-NEXT: Against group ([[GRP3]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: %a High: ((4 * %n) + %a)) +; CHECK-NEXT: Member: {%a,+,4}<%loop> +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %b High: (-16 + (20 * %n) + %b)) +; CHECK-NEXT: Member: {%b,+,20}<%loop> +; CHECK-NEXT: Group [[GRP3]]: +; CHECK-NEXT: (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a)) +; CHECK-NEXT: Member: {((4 * %offset) + %a),+,4}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv + %l1 = load float, ptr %gep.a.iv, align 4 + %iv.offset = add i64 %iv, %offset + %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset + %l2 = load float, ptr %gep.a.iv.off, align 4 + %ad = fadd fast float %l1, %l2 + store float %ad, ptr %gep.a.iv, align 4 + %gep.b = getelementptr inbounds float, ptr %b, i64 %iv2 + store float 0.0, ptr %gep.b + %iv.next = add nuw nsw i64 %iv, 1 + %iv2.next = add i64 %iv2, 5 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @dependency_check_and_runtime_checks_needed_gepb_not_inbounds_iv2_step5(ptr %a, ptr %b, i64 %offset, i64 %n) { +; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_gepb_not_inbounds_iv2_step5' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot check memory dependencies at runtime +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv + %l1 = load float, ptr %gep.a.iv, align 4 + %iv.offset = add i64 %iv, %offset + %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset + %l2 = load float, ptr %gep.a.iv.off, align 4 + %ad = fadd fast float %l1, %l2 + store float %ad, ptr %gep.a.iv, align 4 + %gep.b = getelementptr i8, ptr %b, i64 %iv2 + store float 0.0, ptr %gep.b + %iv.next = add nuw nsw i64 %iv, 1 + %iv2.next = add i64 %iv2, 5 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @dependency_check_and_runtime_checks_needed_gepb_is_inbounds_iv2_step_not_constant(ptr %a, ptr %b, i64 %offset, i64 %n, i64 %s) { +; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_gepb_is_inbounds_iv2_step_not_constant' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot check memory dependencies at runtime +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Equal predicate: %s == 1 +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv2: +; CHECK-NEXT: {%b,+,%s}<%loop> +; CHECK-NEXT: --> {%b,+,1}<%loop> +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv + %l1 = load float, ptr %gep.a.iv, align 4 + %iv.offset = add i64 %iv, %offset + %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset + %l2 = load float, ptr %gep.a.iv.off, align 4 + %ad = fadd fast float %l1, %l2 + store float %ad, ptr %gep.a.iv, align 4 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv2 + store float 0.0, ptr %gep.b + %iv.next = add nuw nsw i64 %iv, 1 + %iv2.next = add i64 %iv2, %s + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + + +define void @dependency_check_and_runtime_checks_needed_gepb_not_inbounds_iv2_step_not_constant(ptr %a, ptr %b, i64 %offset, i64 %n, i64 %s) { +; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_gepb_not_inbounds_iv2_step_not_constant' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot check memory dependencies at runtime +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Equal predicate: %s == 1 +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv2: +; CHECK-NEXT: {%b,+,%s}<%loop> +; CHECK-NEXT: --> {%b,+,1}<%loop> +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv + %l1 = load float, ptr %gep.a.iv, align 4 + %iv.offset = add i64 %iv, %offset + %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset + %l2 = load float, ptr %gep.a.iv.off, align 4 + %ad = fadd fast float %l1, %l2 + store float %ad, ptr %gep.a.iv, align 4 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv2 + store float 0.0, ptr %gep.b + %iv.next = add nuw nsw i64 %iv, 1 + %iv2.next = add i64 %iv2, %s + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @dependency_check_and_runtime_checks_needed_gepb_may_wrap(ptr %a, ptr %b, i64 %offset, i64 %n) { +; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_gepb_may_wrap' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP4:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP5:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP4]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.b = getelementptr float, ptr %b, i64 %iv2 +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr 
%a, i64 %iv.offset +; CHECK-NEXT: Against group ([[GRP6]]): +; CHECK-NEXT: %gep.b = getelementptr float, ptr %b, i64 %iv2 +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP4]]: +; CHECK-NEXT: (Low: %a High: ((4 * %n) + %a)) +; CHECK-NEXT: Member: {%a,+,4}<%loop> +; CHECK-NEXT: Group [[GRP5]]: +; CHECK-NEXT: (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a)) +; CHECK-NEXT: Member: {((4 * %offset) + %a),+,4}<%loop> +; CHECK-NEXT: Group [[GRP6]]: +; CHECK-NEXT: (Low: %b High: (-4 + (8 * %n) + %b)) +; CHECK-NEXT: Member: {%b,+,8}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%b,+,8}<%loop> Added Flags: <nusw> +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ] + %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv + %l1 = load float, ptr %gep.a.iv, align 4 + %iv.offset = add i64 %iv, %offset + %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset + %l2 = load float, ptr %gep.a.iv.off, align 4 + %ad = fadd fast float %l1, %l2 + store float %ad, ptr %gep.a.iv, align 4 + %gep.b = getelementptr float, ptr %b, i64 %iv2 + store float 0.0, ptr %gep.b + %iv.next = add nuw nsw i64 %iv, 1 + %iv2.next = add i64 %iv2, 2 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Assembler/memory-attribute-errors.ll b/llvm/test/Assembler/memory-attribute-errors.ll index 1fba90362e79b..2eed11d9465d5 100644 --- a/llvm/test/Assembler/memory-attribute-errors.ll +++ b/llvm/test/Assembler/memory-attribute-errors.ll @@ -12,16 +12,16 @@ ; MISSING-ARGS: error: expected '(' declare void @fn() memory ;--- empty.ll -; EMPTY: error: expected memory location (argmem, inaccessiblemem) or access kind (none, read, write, readwrite) +; EMPTY: error: expected memory location (argmem, inaccessiblemem, errnomem) or access kind (none, read, write, readwrite) declare void @fn() memory() ;--- unterminated.ll ; UNTERMINATED: error: unterminated memory attribute declare void @fn() memory(read ;--- invalid-kind.ll -; INVALID-KIND: error: expected memory location (argmem, inaccessiblemem) or access kind (none, read, write, readwrite) +; INVALID-KIND: error: expected memory location (argmem, inaccessiblemem, errnomem) or access kind (none, read, write, readwrite) declare void @fn() memory(foo) ;--- other.ll -; OTHER: error: expected memory location (argmem, inaccessiblemem) or access kind (none, read, write, readwrite) +; OTHER: error: expected memory location (argmem, inaccessiblemem, errnomem) or access kind (none, read, write, readwrite) declare void @fn() memory(other: read) ;--- missing-colon.ll ; MISSING-COLON: error: expected ':' after location diff --git a/llvm/test/Assembler/memory-attribute.ll b/llvm/test/Assembler/memory-attribute.ll index 2f7d3980eb378..effd4ce7c4548 100644 --- a/llvm/test/Assembler/memory-attribute.ll +++ b/llvm/test/Assembler/memory-attribute.ll @@ -40,6 +40,18 @@ declare void @fn_inaccessiblemem_write() memory(inaccessiblemem: write) ; CHECK: @fn_inaccessiblemem_readwrite() declare void @fn_inaccessiblemem_readwrite() memory(inaccessiblemem: readwrite) +; CHECK: Function Attrs: memory(errnomem: read) +; CHECK: @fn_errnomem_read() +declare void @fn_errnomem_read() memory(errnomem: read) + +; CHECK: Function Attrs: memory(errnomem: write) +; CHECK:
@fn_errnomem_write() +declare void @fn_errnomem_write() memory(errnomem: write) + +; CHECK: Function Attrs: memory(errnomem: readwrite) +; CHECK: @fn_errnomem_readwrite() +declare void @fn_errnomem_readwrite() memory(errnomem: readwrite) + ; CHECK: Function Attrs: memory(read, argmem: readwrite) ; CHECK: @fn_read_argmem_readwrite() declare void @fn_read_argmem_readwrite() memory(read, argmem: readwrite) diff --git a/llvm/test/Bitcode/Inputs/memory-attribute-upgrade.bc b/llvm/test/Bitcode/Inputs/memory-attribute-upgrade.bc new file mode 100644 index 0000000000000..52a38d27b1032 Binary files /dev/null and b/llvm/test/Bitcode/Inputs/memory-attribute-upgrade.bc differ diff --git a/llvm/test/Bitcode/memory-attribute-upgrade.ll b/llvm/test/Bitcode/memory-attribute-upgrade.ll new file mode 100644 index 0000000000000..915b62a88935d --- /dev/null +++ b/llvm/test/Bitcode/memory-attribute-upgrade.ll @@ -0,0 +1,7 @@ +; RUN: llvm-dis < %S/Inputs/memory-attribute-upgrade.bc | FileCheck %s + +; CHECK: ; Function Attrs: memory(write, argmem: read) +; CHECK-NEXT: define void @test_any_write_argmem_read(ptr %p) + +; CHECK: ; Function Attrs: memory(read, argmem: readwrite, inaccessiblemem: none) +; CHECK-NEXT: define void @test_any_read_argmem_readwrite(ptr %p) diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll new file mode 100644 index 0000000000000..992d050cf9ca0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lsfe.ll @@ -0,0 +1,1924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" +; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lsfe -O0 | FileCheck %s --check-prefixes=CHECK,-O0 +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lsfe -O1 | FileCheck %s --check-prefixes=CHECK,-O1 + +define dso_local half @atomicrmw_fadd_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: 
cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float 
@atomicrmw_fadd_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; 
-O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; 
-O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: 
atomicrmw_fsub_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: 
atomicrmw_fsub_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_monotonic(ptr %ptr, double 
%value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; 
-O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_monotonic(ptr %ptr, double %value) { +; 
CHECK-LABEL: atomicrmw_fsub_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: 
atomicrmw_fmax_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 4 + 
ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: 
atomicrmw_fmax_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float 
@atomicrmw_fmax_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 2 + ret half %r +} + 
+define dso_local half @atomicrmw_fmin_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} 
+ +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define 
dso_local double @atomicrmw_fmin_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat 
@atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double 
@atomicrmw_fmin_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 1 + ret double %r +} diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll new file mode 100644 index 0000000000000..b9dccdeeb600d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a_fp.ll @@ -0,0 +1,1924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" +; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8a -O0 | FileCheck %s --check-prefixes=CHECK,-O0 +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8a -O1 | FileCheck %s --check-prefixes=CHECK,-O1 + +define dso_local half @atomicrmw_fadd_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat 
@atomicrmw_fadd_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: 
stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half 
@atomicrmw_fadd_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, 
align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr 
%ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr 
%ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double 
%value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 1 + ret 
bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value release, align 1 + ret double %r +} + 
+define dso_local double @atomicrmw_fsub_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: 
subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O1: ldaxr 
w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acq_rel(ptr %ptr, half 
%value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value release, align 1 + ret float %r +} + +define 
dso_local float @atomicrmw_fmax_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_release: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = 
atomicrmw fmin ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O0: ldaxrh w0, [x9] +; -O0: cmp w0, w10, uxth +; -O0: stlxrh w8, w11, [x9] +; -O0: subs w8, w8, w0, uxth +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; 
-O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_release: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O0: ldaxr w0, [x9] +; -O0: cmp w0, w10 +; -O0: stlxr w8, w11, [x9] +; -O0: subs w8, w0, w8 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_release: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin 
ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O0: ldaxr x0, [x9] +; -O0: cmp x0, x10 +; -O0: stlxr w8, x11, [x9] +; -O0: subs x8, x0, x8 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange 
+ %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 1 + ret double %r +} diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lsfe.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lsfe.ll new file mode 100644 index 0000000000000..6c46407177297 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lsfe.ll @@ -0,0 +1,1984 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" +; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lsfe -O0 | FileCheck %s --check-prefixes=CHECK,-O0 +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+lsfe -O1 | FileCheck %s --check-prefixes=CHECK,-O1 + +define dso_local half @atomicrmw_fadd_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O0: add 
w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: 
atomicrmw_fadd_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, 
half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acquire: +; CHECK: 
bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_release: +; -O1: ldxrh w8, [x0] +; 
-O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] 
+; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_release(ptr %ptr, double %value) { +; 
-O0-LABEL: atomicrmw_fsub_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat 
@atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double 
@atomicrmw_fsub_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; 
-O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: 
atomicrmw_fmax_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, 
half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acquire: +; CHECK: 
bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_release: +; -O1: ldxrh w8, [x0] +; 
-O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] 
+; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_release(ptr %ptr, double %value) { +; 
-O0-LABEL: atomicrmw_fmin_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat 
@atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double 
@atomicrmw_fmin_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 1 + ret double %r +} diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a_fp.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a_fp.ll new file mode 100644 index 0000000000000..bdd488e6933e5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a_fp.ll @@ -0,0 +1,1984 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" +; The base test file was generated by ./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+v8a -O0 | FileCheck %s --check-prefixes=CHECK,-O0 +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64_be -mattr=+v8a -O1 | FileCheck %s --check-prefixes=CHECK,-O1 + +define dso_local half @atomicrmw_fadd_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: 
stlxrh w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: 
cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: 
atomicrmw_fadd_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fadd_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fadd_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fadd_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fadd_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat 
@atomicrmw_fadd_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fadd_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fadd_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fadd_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fadd_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fadd_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fadd ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr 
%ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value release, 
align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; 
-O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fsub_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fsub_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fsub_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: 
atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fsub_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fsub_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fsub_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fsub_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double 
@atomicrmw_fsub_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fsub_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fsub_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fsub ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] 
+ %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: 
stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: 
atomicrmw_fmax_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmax_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmax_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmax_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmax_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat 
@atomicrmw_fmax_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmax_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmax_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmax_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmax_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmax_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmax ptr %ptr, double %value seq_cst, align 1 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_monotonic(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_monotonic: +; -O1: ldxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr 
%ptr, half %value monotonic, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acquire(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acquire: +; -O1: ldaxrh w8, [x0] +; -O1: stxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_release(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_release: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_release: +; -O1: ldxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value release, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_acq_rel(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_acq_rel: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 2 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_aligned_seq_cst(ptr %ptr, half %value) { +; -O0-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_half_aligned_seq_cst: +; -O1: ldaxrh w8, [x0] +; -O1: stlxrh w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_monotonic: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acquire: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_release: +; -O1: ldxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value release, 
align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_acq_rel: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 2 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_aligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: ldaxrh w9, [x11] +; -O0: cmp w9, w8, uxth +; -O0: stlxrh w10, w12, [x11] +; -O0: subs w8, w9, w8, uxth +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_bfloat_aligned_seq_cst: +; -O1: ldaxrh w9, [x0] +; -O1: add w9, w9, w8 +; -O1: add w9, w10, w9 +; -O1: stlxrh w10, w9, [x0] + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 2 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_monotonic(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_monotonic: +; -O1: ldxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acquire(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acquire: +; -O1: ldaxr w8, [x0] +; -O1: stxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_release(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_release: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_release: +; -O1: ldxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value release, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_acq_rel(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_acq_rel: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 4 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_aligned_seq_cst(ptr %ptr, float %value) { +; -O0-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O0: ldaxr w9, [x11] +; -O0: cmp w9, w8 +; -O0: stlxr w10, w12, [x11] +; -O0: subs w8, w9, w8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_float_aligned_seq_cst: +; -O1: ldaxr w8, [x0] +; -O1: stlxr w9, w8, [x0] + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 4 + ret float %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_monotonic(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; 
-O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_monotonic: +; -O1: ldxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acquire(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acquire: +; -O1: ldaxr x8, [x0] +; -O1: stxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_release(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_release: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_release: +; -O1: ldxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value release, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_acq_rel(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_acq_rel: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 8 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_aligned_seq_cst(ptr %ptr, double %value) { +; -O0-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O0: ldaxr x9, [x11] +; -O0: cmp x9, x8 +; -O0: stlxr w10, x12, [x11] +; -O0: subs x8, x9, x8 +; -O0: subs w8, w8, #1 +; +; -O1-LABEL: atomicrmw_fmin_double_aligned_seq_cst: +; -O1: ldaxr x8, [x0] +; -O1: stlxr w9, x8, [x0] + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 8 + ret double %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_monotonic(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value monotonic, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acquire(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acquire, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_release(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value release, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_acq_rel(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value acq_rel, align 1 + ret half %r +} + +define dso_local half @atomicrmw_fmin_half_unaligned_seq_cst(ptr %ptr, half %value) { +; CHECK-LABEL: atomicrmw_fmin_half_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, half %value seq_cst, align 1 + ret half %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_monotonic(ptr %ptr, bfloat %value) { +; -O0-LABEL: 
atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_monotonic: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value monotonic, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acquire(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acquire: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acquire, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_release(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_release: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value release, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_acq_rel(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_acq_rel: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value acq_rel, align 1 + ret bfloat %r +} + +define dso_local bfloat @atomicrmw_fmin_bfloat_unaligned_seq_cst(ptr %ptr, bfloat %value) { +; -O0-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O0: add w8, w8, w9 +; -O0: add w8, w8, w9 +; -O0: bl __atomic_compare_exchange +; +; -O1-LABEL: atomicrmw_fmin_bfloat_unaligned_seq_cst: +; -O1: add w8, w8, w20 +; -O1: add w8, w9, w8 +; -O1: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, bfloat %value seq_cst, align 1 + ret bfloat %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_monotonic(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value monotonic, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acquire(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acquire, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_release(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value release, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_acq_rel(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value acq_rel, align 1 + ret float %r +} + +define dso_local float @atomicrmw_fmin_float_unaligned_seq_cst(ptr %ptr, float %value) { +; CHECK-LABEL: atomicrmw_fmin_float_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, float %value seq_cst, align 1 + ret float %r +} + +define dso_local double 
@atomicrmw_fmin_double_unaligned_monotonic(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_monotonic: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value monotonic, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_acquire(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acquire: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acquire, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_release(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_release: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value release, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_acq_rel(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_acq_rel: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value acq_rel, align 1 + ret double %r +} + +define dso_local double @atomicrmw_fmin_double_unaligned_seq_cst(ptr %ptr, double %value) { +; CHECK-LABEL: atomicrmw_fmin_double_unaligned_seq_cst: +; CHECK: bl __atomic_compare_exchange + %r = atomicrmw fmin ptr %ptr, double %value seq_cst, align 1 + ret double %r +} diff --git a/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py b/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py index ecda5fd69ca5d..f40bbaeb930c0 100755 --- a/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py +++ b/llvm/test/CodeGen/AArch64/Atomics/generate-tests.py @@ -2,6 +2,7 @@ import textwrap import enum import os +import re """ Generate the tests in llvm/test/CodeGen/AArch64/Atomics. Run from top level llvm-project. @@ -13,20 +14,31 @@ ] -# Type name size -class Type(enum.Enum): - # Value is the size in bytes - i8 = 1 - i16 = 2 - i32 = 4 - i64 = 8 - i128 = 16 +class ByteSizes: + def __init__(self, pairs): + if not isinstance(pairs, list): + raise ValueError("Must init with a list of key-value pairs") - def align(self, aligned: bool) -> int: - return self.value if aligned else 1 + self._data = pairs[:] + + def __iter__(self): + return iter(self._data) - def __str__(self) -> str: - return self.name + +# fmt: off +Type = ByteSizes([ + ("i8", 1), + ("i16", 2), + ("i32", 4), + ("i64", 8), + ("i128", 16)]) + +FPType = ByteSizes([ + ("half", 2), + ("bfloat", 2), + ("float", 4), + ("double", 8)]) +# fmt: on # Is this an aligned or unaligned access? @@ -115,6 +127,9 @@ class Feature(enum.Flag): rcpc3 = enum.auto() # FEAT_LSE2 + FEAT_LRCPC3 lse2_lse128 = enum.auto() # FEAT_LSE2 + FEAT_LSE128 + def test_scope(): + return "all" + @property def mattr(self): if self == Feature.outline_atomics: @@ -128,6 +143,21 @@ def mattr(self): return "+" + self.name +class FPFeature(enum.Flag): + # Feature names in filenames are determined by the spelling here: + v8a_fp = enum.auto() + lsfe = enum.auto() # FEAT_LSFE + + def test_scope(): + return "atomicrmw" + + @property + def mattr(self): + if self == FPFeature.v8a_fp: + return "+v8a" + return "+" + self.name + + ATOMICRMW_OPS = [ "xchg", "add", @@ -142,11 +172,32 @@ def mattr(self): "umin", ] +FP_ATOMICRMW_OPS = [ + "fadd", + "fsub", + "fmax", + "fmin", +] + + +def relpath(): + # __file__ changed to return absolute path in Python 3.9. 
Print only + # up to llvm-project (6 levels higher), to avoid unnecessary diffs and + # revealing directory structure of people running this script + top = "../" * 6 + fp = os.path.relpath(__file__, os.path.abspath(os.path.join(__file__, top))) + return fp + -def all_atomicrmw(f): - for op in ATOMICRMW_OPS: +def align(val, aligned: bool) -> int: + return val if aligned else 1 + + +def all_atomicrmw(f, datatype, atomicrmw_ops): + for op in atomicrmw_ops: for aligned in Aligned: - for ty in Type: + for ty, val in datatype: + alignval = align(val, aligned) for ordering in ATOMICRMW_ORDERS: name = f"atomicrmw_{op}_{ty}_{aligned}_{ordering}" instr = "atomicrmw" @@ -154,7 +205,7 @@ def all_atomicrmw(f): textwrap.dedent( f""" define dso_local {ty} @{name}(ptr %ptr, {ty} %value) {{ - %r = {instr} {op} ptr %ptr, {ty} %value {ordering}, align {ty.align(aligned)} + %r = {instr} {op} ptr %ptr, {ty} %value {ordering}, align {alignval} ret {ty} %r }} """ @@ -164,7 +215,8 @@ def all_atomicrmw(f): def all_load(f): for aligned in Aligned: - for ty in Type: + for ty, val in Type: + alignval = align(val, aligned) for ordering in ATOMIC_LOAD_ORDERS: for const in [False, True]: name = f"load_atomic_{ty}_{aligned}_{ordering}" @@ -176,7 +228,7 @@ def all_load(f): textwrap.dedent( f""" define dso_local {ty} @{name}({arg}) {{ - %r = {instr} {ty}, ptr %ptr {ordering}, align {ty.align(aligned)} + %r = {instr} {ty}, ptr %ptr {ordering}, align {alignval} ret {ty} %r }} """ @@ -186,7 +238,8 @@ def all_load(f): def all_store(f): for aligned in Aligned: - for ty in Type: + for ty, val in Type: + alignval = align(val, aligned) for ordering in ATOMIC_STORE_ORDERS: # FIXME stores name = f"store_atomic_{ty}_{aligned}_{ordering}" instr = "store atomic" @@ -194,7 +247,7 @@ def all_store(f): textwrap.dedent( f""" define dso_local void @{name}({ty} %value, ptr %ptr) {{ - {instr} {ty} %value, ptr %ptr {ordering}, align {ty.align(aligned)} + {instr} {ty} %value, ptr %ptr {ordering}, align {alignval} ret void }} """ @@ -204,7 +257,8 @@ def all_store(f): def all_cmpxchg(f): for aligned in Aligned: - for ty in Type: + for ty, val in Type: + alignval = align(val, aligned) for success_ordering in CMPXCHG_SUCCESS_ORDERS: for failure_ordering in CMPXCHG_FAILURE_ORDERS: for weak in [False, True]: @@ -217,7 +271,7 @@ def all_cmpxchg(f): textwrap.dedent( f""" define dso_local {ty} @{name}({ty} %expected, {ty} %new, ptr %ptr) {{ - %pair = {instr} ptr %ptr, {ty} %expected, {ty} %new {success_ordering} {failure_ordering}, align {ty.align(aligned)} + %pair = {instr} ptr %ptr, {ty} %expected, {ty} %new {success_ordering} {failure_ordering}, align {alignval} %r = extractvalue {{ {ty}, i1 }} %pair, 0 ret {ty} %r }} @@ -248,7 +302,8 @@ def header(f, triple, features, filter_args: str): ) f.write(filter_args) f.write("\n") - f.write(f"; The base test file was generated by {__file__}\n") + f.write(f"; The base test file was generated by ./{relpath()}\n") + for feat in features: for OptFlag in ["-O0", "-O1"]: f.write( @@ -273,8 +328,7 @@ def header(f, triple, features, filter_args: str): ) -def write_lit_tests(): - os.chdir("llvm/test/CodeGen/AArch64/Atomics/") +def write_lit_tests(feature, datatypes, ops): for triple in TRIPLES: # Feature has no effect on fence, so keep it to one file. 
with open(f"{triple}-fence.ll", "w") as f: @@ -282,11 +336,15 @@ def write_lit_tests(): header(f, triple, Feature, filter_args) all_fence(f) - for feat in Feature: + for feat in feature: with open(f"{triple}-atomicrmw-{feat.name}.ll", "w") as f: filter_args = r'--filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"' header(f, triple, [feat], filter_args) - all_atomicrmw(f) + all_atomicrmw(f, datatypes, ops) + + # Floating point atomics only supported for atomicrmw currently + if feature.test_scope() == "atomicrmw": + continue with open(f"{triple}-cmpxchg-{feat.name}.ll", "w") as f: filter_args = r'--filter-out "\b(sp)\b" --filter "^\s*(ld[^r]|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"' @@ -305,7 +363,9 @@ def write_lit_tests(): if __name__ == "__main__": - write_lit_tests() + os.chdir("llvm/test/CodeGen/AArch64/Atomics/") + write_lit_tests(Feature, Type, ATOMICRMW_OPS) + write_lit_tests(FPFeature, FPType, FP_ATOMICRMW_OPS) print( textwrap.dedent( diff --git a/llvm/test/CodeGen/AArch64/a55-fuse-address.mir b/llvm/test/CodeGen/AArch64/a55-fuse-address.mir index 4edff043a7b3e..3e1b6076f0167 100644 --- a/llvm/test/CodeGen/AArch64/a55-fuse-address.mir +++ b/llvm/test/CodeGen/AArch64/a55-fuse-address.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -o - %s -mtriple=aarch64 -run-pass=machine-scheduler -verify-machineinstrs | FileCheck %s +# RUN: llc -o - %s -mtriple=aarch64 -passes=machine-scheduler | FileCheck %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" diff --git a/llvm/test/CodeGen/AArch64/align-down.ll b/llvm/test/CodeGen/AArch64/align-down.ll index 4b1cdfd2770f6..96e2a5d7d33f7 100644 --- a/llvm/test/CodeGen/AArch64/align-down.ll +++ b/llvm/test/CodeGen/AArch64/align-down.ll @@ -54,10 +54,9 @@ define i32 @t2_commutative(i32 %ptr, i32 %alignment) nounwind { define i32 @t3_extrause0(i32 %ptr, i32 %alignment, ptr %mask_storage) nounwind { ; CHECK-LABEL: t3_extrause0: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w1 -; CHECK-NEXT: sub w9, w1, #1 -; CHECK-NEXT: and w0, w0, w8 -; CHECK-NEXT: str w9, [x2] +; CHECK-NEXT: sub w8, w1, #1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: str w8, [x2] ; CHECK-NEXT: ret %mask = add i32 %alignment, -1 store i32 %mask, ptr %mask_storage diff --git a/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir b/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir index e578b5d7f04f3..3a33291cbf8e0 100644 --- a/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir +++ b/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -run-pass=machine-scheduler %s -o - | FileCheck %s +# RUN: llc -passes=machine-scheduler %s -o - | FileCheck %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir b/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir index a24972d138832..a4c88be375c01 100644 --- a/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir +++ b/llvm/test/CodeGen/AArch64/cfi-fixup-multi-section.mir @@ -10,9 +10,19 @@ ret i32 0 } + define i32 @f1(i32 %x) #1 { + entry: br label %return + if.end: br label %return + if.then2: br label %return + if.else: br label %return + return: + ret i32 0 + } + declare i32 @g(i32) attributes #0 = { nounwind shadowcallstack uwtable 
"sign-return-address"="non-leaf" "target-features"="+reserve-x18" } + attributes #1 = { nounwind shadowcallstack uwtable(sync) "sign-return-address"="non-leaf" "target-features"="+reserve-x18" } ... --- @@ -197,4 +207,181 @@ body: | B %bb.7 +... +--- +name: f1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +failsVerification: false +registers: [] +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 16 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +body: | + ; CHECK-LABEL: name: f1 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.4(0x30000000), %bb.1(0x50000000) + ; CHECK-NEXT: liveins: $w0, $lr, $x18 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBZW renamable $w0, %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.end: + ; CHECK-NEXT: successors: %bb.3(0x30000000), %bb.2(0x50000000) + ; CHECK-NEXT: liveins: $w0, $lr, $x18 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $x18 = frame-setup STRXpost $lr, $x18, 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x16, 0x12, 0x02, 0x82, 0x78 + ; CHECK-NEXT: frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state + ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $lr, $sp, -16 :: (store (s64) into %stack.0) + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -16 + ; CHECK-NEXT: TBNZW renamable $w0, 31, %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.else: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w0 = nuw nsw ADDWri killed renamable $w0, 1, 0 + ; CHECK-NEXT: BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-NEXT: renamable $w8 = MOVZWi 1, 0 + ; CHECK-NEXT: $w0 = SUBWrs killed renamable $w8, killed renamable $w0, 0 + ; CHECK-NEXT: B %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.if.then2 (bbsections 1): + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x16, 0x12, 0x02, 0x82, 0x78 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -16 + ; CHECK-NEXT: renamable $w0 = nsw SUBWri killed renamable $w0, 1, 0 + ; CHECK-NEXT: BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-NEXT: renamable $w0 = nsw ADDWri killed renamable $w0, 1, 0 + ; CHECK-NEXT: B %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
bb.4.return: + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.return: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: B %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.return: + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.0) + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + ; CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION negate_ra_sign_state + ; CHECK-NEXT: early-clobber $x18, $lr = frame-destroy LDRXpre $x18, -8 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w18 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.return: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: B %bb.6 + bb.0.entry: + successors: %bb.4(0x30000000), %bb.1(0x50000000) + liveins: $w0, $lr, $x18 + + CBZW renamable $w0, %bb.4 + + bb.1.if.end: + successors: %bb.3(0x30000000), %bb.2(0x50000000) + liveins: $w0, $lr, $x18 + + early-clobber $x18 = frame-setup STRXpost $lr, $x18, 8 + frame-setup CFI_INSTRUCTION escape 0x16, 0x12, 0x02, 0x82, 0x78 + frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp + frame-setup CFI_INSTRUCTION negate_ra_sign_state + early-clobber $sp = frame-setup STRXpre killed $lr, $sp, -16 :: (store (s64) into %stack.0) + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $w30, -16 + TBNZW renamable $w0, 31, %bb.3 + + bb.2.if.else: + successors: %bb.5(0x80000000) + liveins: $w0 + + renamable $w0 = nuw nsw ADDWri killed renamable $w0, 1, 0 + BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0 + renamable $w8 = MOVZWi 1, 0 + $w0 = SUBWrs killed renamable $w8, killed renamable $w0, 0 + B %bb.5 + + bb.3.if.then2 (bbsections 1): + successors: %bb.5(0x80000000) + liveins: $w0 + + renamable $w0 = nsw SUBWri killed renamable $w0, 1, 0 + BL @g, csr_aarch64_aapcs_scs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0 + renamable $w0 = nsw ADDWri killed renamable $w0, 1, 0 + B %bb.5 + + bb.4.return: + liveins: $w0 + RET undef $lr, implicit killed $w0 + + bb.5.return: + liveins: $w0 + B %bb.6 + + bb.7.return: + liveins: $w0 + early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.0) + frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp + frame-destroy CFI_INSTRUCTION negate_ra_sign_state + early-clobber $x18, $lr = frame-destroy LDRXpre $x18, -8 + frame-destroy CFI_INSTRUCTION restore $w18 + frame-destroy CFI_INSTRUCTION restore $w30 + RET undef $lr, implicit killed $w0 + + bb.6.return: + liveins: $w0 + B %bb.7 + + ... 
diff --git a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir index 37ab9418f4dbd..5d761f10be3b2 100644 --- a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir +++ b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir @@ -1,4 +1,5 @@ #RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s +#RUN: llc -mtriple=aarch64-- -mcpu=cyclone -passes=machine-scheduler -o - %s | FileCheck %s --- name: merge_stack # CHECK-LABEL: name: merge_stack diff --git a/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir b/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir index 4bf8afff90d4c..5655bfa5d2945 100644 --- a/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir +++ b/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir @@ -1,9 +1,15 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=true \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s + # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=false\ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NODUMP +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=false\ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NODUMP + # REQUIRES: asserts --- name: f diff --git a/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir b/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir index bff6d1d71b7c4..c90d6bd3cb420 100644 --- a/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir +++ b/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir @@ -4,17 +4,34 @@ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ # RUN: 2>&1 | FileCheck %s --check-prefix=TOP --strict-whitespace +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ +# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ +# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ +# RUN: 2>&1 | FileCheck %s --check-prefix=TOP --strict-whitespace + # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ # RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ # RUN: 2>&1 | FileCheck %s --check-prefix=BOTTOM --strict-whitespace +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ +# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ +# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ +# RUN: 2>&1 | FileCheck %s --check-prefix=BOTTOM --strict-whitespace + # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ # RUN: -sched-print-cycles=true -misched-dump-schedule-trace=true \ # RUN: 2>&1 | FileCheck %s --check-prefix=BIDIRECTIONAL +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ +# RUN: -passes=machine-scheduler 
-debug-only=machine-scheduler -o - %s \ +# RUN: -sched-print-cycles=true -misched-dump-schedule-trace=true \ +# RUN: 2>&1 | FileCheck %s --check-prefix=BIDIRECTIONAL + # REQUIRES: asserts, aarch64-registered-target --- name: f diff --git a/llvm/test/CodeGen/AArch64/force-enable-intervals.mir b/llvm/test/CodeGen/AArch64/force-enable-intervals.mir index a53d4e7480307..8d47eee1c8e19 100644 --- a/llvm/test/CodeGen/AArch64/force-enable-intervals.mir +++ b/llvm/test/CodeGen/AArch64/force-enable-intervals.mir @@ -3,11 +3,21 @@ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler \ # RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ +# RUN: -misched-dump-reserved-cycles=true \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler \ +# RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s + # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -misched-dump-reserved-cycles=true -sched-model-force-enable-intervals=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler \ # RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s --check-prefix=FORCE +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ +# RUN: -misched-dump-reserved-cycles=true -sched-model-force-enable-intervals=true \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler \ +# RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s --check-prefix=FORCE + # REQUIRES: asserts, aarch64-registered-target --- name: f diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 9ef6d61c350ec..b2b3430f4d85e 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -5548,3 +5548,151 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f) ret <16 x i16> %x } + +define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) { +; CHECK-SD-LABEL: test_signed_v2f128_v2i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #96 +; CHECK-SD-NEXT: stp x30, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: mov v2.16b, v1.16b +; CHECK-SD-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill +; CHECK-SD-NEXT: adrp x8, .LCPI86_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI86_0] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __getf2 +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __fixtfdi +; CHECK-SD-NEXT: adrp x8, .LCPI86_1 +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w19, #0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI86_1] +; CHECK-SD-NEXT: mov x20, #-9223372036854775808 // =0x8000000000000000 +; CHECK-SD-NEXT: csel x19, x20, x0, lt +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __gttf2 +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov x21, #9223372036854775807 // =0x7fffffffffffffff +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: csel x19, x21, x19, gt +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; 
CHECK-SD-NEXT: bl __unordtf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: csel x8, xzr, x19, ne +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: bl __getf2 +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __fixtfdi +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w19, #0 +; CHECK-SD-NEXT: csel x19, x20, x0, lt +; CHECK-SD-NEXT: bl __gttf2 +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: csel x19, x21, x19, gt +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: bl __unordtf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: csel x8, xzr, x19, ne +; CHECK-SD-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: ldp x30, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: add sp, sp, #96 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_signed_v2f128_v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp x30, x23, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: adrp x8, .LCPI86_1 +; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI86_1] +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: str q2, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __getf2 +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: mov x20, #-4594234569871327232 // =0xc03e000000000000 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x21, x8, x20, lt +; CHECK-GI-NEXT: adrp x8, .LCPI86_0 +; CHECK-GI-NEXT: mov v0.d[1], x21 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI86_0] +; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __gttf2 +; CHECK-GI-NEXT: mov x22, #-1125899906842624 // =0xfffc000000000000 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: mov x23, #4629137466983448575 // =0x403dffffffffffff +; CHECK-GI-NEXT: csel x8, x19, x22, gt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: csel x8, x21, x23, gt +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: bl __fixtfdi +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __unordtf2 +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: csel x21, xzr, x19, ne +; CHECK-GI-NEXT: bl __getf2 +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; 
CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, x20, lt +; CHECK-GI-NEXT: mov v0.d[1], x20 +; CHECK-GI-NEXT: bl __gttf2 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: csel x8, x19, x22, gt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: csel x8, x20, x23, gt +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: bl __fixtfdi +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __unordtf2 +; CHECK-GI-NEXT: mov v0.d[0], x21 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: csel x8, xzr, x19, ne +; CHECK-GI-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x23, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ret + %x = call <2 x i64> @llvm.fptosi.sat.v2f128.v2i64(<2 x fp128> %f) + ret <2 x i64> %x +} diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index e1670ad2dc053..b76df6a101e5f 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -4546,3 +4546,121 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { %x = call <16 x i16> @llvm.fptoui.sat.v16f64.v16i16(<16 x double> %f) ret <16 x i16> %x } + +define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) { +; CHECK-SD-LABEL: test_signed_v2f128_v2i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #80 +; CHECK-SD-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov v2.16b, v1.16b +; CHECK-SD-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill +; CHECK-SD-NEXT: adrp x8, .LCPI86_0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI86_0] +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __getf2 +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __fixunstfdi +; CHECK-SD-NEXT: adrp x8, .LCPI86_1 +; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w19, #0 +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI86_1] +; CHECK-SD-NEXT: csel x19, xzr, x0, lt +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __gttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: csinv x8, x19, xzr, le +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: bl __getf2 +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: bl __fixunstfdi +; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w19, #0 +; CHECK-SD-NEXT: csel x19, xzr, x0, lt +; CHECK-SD-NEXT: bl __gttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: csinv x8, x19, xzr, le +; CHECK-SD-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: fmov d0, x8 
+; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: add sp, sp, #80 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_signed_v2f128_v2i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #96 +; CHECK-GI-NEXT: stp x30, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 96 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: adrp x8, .LCPI86_1 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI86_1] +; CHECK-GI-NEXT: stp q2, q1, [sp, #16] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: bl __getf2 +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x19, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt +; CHECK-GI-NEXT: adrp x8, .LCPI86_0 +; CHECK-GI-NEXT: mov v0.d[1], x20 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI86_0] +; CHECK-GI-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __gttf2 +; CHECK-GI-NEXT: mov x21, #-562949953421312 // =0xfffe000000000000 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: mov x22, #4629418941960159231 // =0x403effffffffffff +; CHECK-GI-NEXT: csel x8, x19, x21, gt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: csel x8, x20, x22, gt +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: bl __fixunstfdi +; CHECK-GI-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov x19, x0 +; CHECK-GI-NEXT: bl __getf2 +; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: csel x20, x8, xzr, lt +; CHECK-GI-NEXT: mov x8, v0.d[1] +; CHECK-GI-NEXT: mov v0.d[0], x20 +; CHECK-GI-NEXT: csel x23, x8, xzr, lt +; CHECK-GI-NEXT: mov v0.d[1], x23 +; CHECK-GI-NEXT: bl __gttf2 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: csel x8, x20, x21, gt +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: csel x8, x23, x22, gt +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: bl __fixunstfdi +; CHECK-GI-NEXT: mov v0.d[0], x19 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x30, x23, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.d[1], x0 +; CHECK-GI-NEXT: add sp, sp, #96 +; CHECK-GI-NEXT: ret + %x = call <2 x i64> @llvm.fptoui.sat.v2f128.v2i64(<2 x fp128> %f) + ret <2 x i64> %x +} diff --git a/llvm/test/CodeGen/AArch64/machine-scheduler.mir b/llvm/test/CodeGen/AArch64/machine-scheduler.mir index 6c0222f4fdd78..ba2c2b33d8e92 100644 --- a/llvm/test/CodeGen/AArch64/machine-scheduler.mir +++ b/llvm/test/CodeGen/AArch64/machine-scheduler.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-scheduler -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -passes=machine-scheduler -o - %s | FileCheck %s --- | define i64 @load_imp-def(ptr nocapture %P, i32 %v) { diff --git a/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir 
b/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir index 8c5a85a4e7a61..2f0d19fec07d9 100644 --- a/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir +++ b/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir @@ -1,5 +1,7 @@ # RUN: llc -o - %s -mtriple=aarch64-- -mattr=+fuse-addsub-2reg-const1 -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,FUSION +# RUN: llc -o - %s -mtriple=aarch64-- -mattr=+fuse-addsub-2reg-const1 -passes=postmisched | FileCheck %s --check-prefixes=CHECK,FUSION # RUN: llc -o - %s -mtriple=aarch64-- -mattr=-fuse-addsub-2reg-const1 -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION +# RUN: llc -o - %s -mtriple=aarch64-- -mattr=-fuse-addsub-2reg-const1 -passes=postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION --- # CHECK-LABEL: name: addsub2reg # CHECK: $w8 = ADDWrr killed renamable $w0, killed renamable $w1 diff --git a/llvm/test/CodeGen/AArch64/macro-fusion-last.mir b/llvm/test/CodeGen/AArch64/macro-fusion-last.mir index 14937a4794e96..affd2bb039e96 100644 --- a/llvm/test/CodeGen/AArch64/macro-fusion-last.mir +++ b/llvm/test/CodeGen/AArch64/macro-fusion-last.mir @@ -1,5 +1,7 @@ # RUN: llc -o - %s -mtriple=aarch64-- -mattr=+arith-bcc-fusion -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,FUSION +# RUN: llc -o - %s -mtriple=aarch64-- -mattr=+arith-bcc-fusion -passes=postmisched | FileCheck %s --check-prefixes=CHECK,FUSION # RUN: llc -o - %s -mtriple=aarch64-- -mattr=-arith-bcc-fusion -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION +# RUN: llc -o - %s -mtriple=aarch64-- -mattr=-arith-bcc-fusion -passes=postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION # Make sure the last instruction is correctly macro-fused when scheduling # top-down (post-ra). 
--- diff --git a/llvm/test/CodeGen/AArch64/misched-branch-targets.mir b/llvm/test/CodeGen/AArch64/misched-branch-targets.mir index 40f148438e537..954082631bdbf 100644 --- a/llvm/test/CodeGen/AArch64/misched-branch-targets.mir +++ b/llvm/test/CodeGen/AArch64/misched-branch-targets.mir @@ -1,6 +1,9 @@ # RUN: llc -o - -run-pass=machine-scheduler -misched=shuffle %s | FileCheck %s # RUN: llc -o - -run-pass=postmisched %s | FileCheck %s +# RUN: llc -o - -passes=machine-scheduler -misched=shuffle %s | FileCheck %s +# RUN: llc -o - -passes=postmisched %s | FileCheck %s + # REQUIRES: asserts # -misched=shuffle is only available with assertions enabled diff --git a/llvm/test/CodeGen/AArch64/misched-bundle.mir b/llvm/test/CodeGen/AArch64/misched-bundle.mir index ac6112e8c60ef..8463cb038a3bc 100644 --- a/llvm/test/CodeGen/AArch64/misched-bundle.mir +++ b/llvm/test/CodeGen/AArch64/misched-bundle.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a510 -run-pass=machine-scheduler -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a510 -passes=machine-scheduler -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK: SU(0): renamable $z0 = LD1H renamable $p0, renamable $x1, renamable $x10 :: (load unknown-size, align 1) diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir index ea40f9e52dcd6..ca92fa14a3fa8 100644 --- a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir +++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir @@ -6,6 +6,14 @@ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ # RUN: | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -mcpu=cortex-a55 %s -o - 2>&1 \ +# RUN: -misched-dump-reserved-cycles=true \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler \ +# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ +# RUN: -misched-detail-resource-booking=true \ +# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ +# RUN: | FileCheck %s + # REQUIRES: asserts, aarch64-registered-target --- | diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir index 9be91b8a01e86..2b34ca54f1e97 100644 --- a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir +++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir @@ -5,6 +5,13 @@ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ # RUN: 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ +# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ +# RUN: -misched-dump-reserved-cycles=true -misched-detail-resource-booking=true\ +# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ +# RUN: 2>&1 | FileCheck %s + # REQUIRES: asserts, aarch64-registered-target --- name: f diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir b/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir index 62276779d1423..60c0026d39466 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir +++ 
b/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir @@ -1,5 +1,7 @@ # RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mattr=fuse-arith-logic -run-pass=machine-scheduler -misched-print-dags | FileCheck %s +# RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mattr=fuse-arith-logic -passes=machine-scheduler -misched-print-dags | FileCheck %s # RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=exynos-m4 -run-pass=machine-scheduler -misched-print-dags | FileCheck %s +# RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=exynos-m4 -passes=machine-scheduler -misched-print-dags | FileCheck %s # REQUIRES: asserts --- diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir b/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir index b0450c5b8c01b..82498164c6ad5 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir +++ b/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir @@ -1,4 +1,5 @@ # RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=cortex-x1 -run-pass=machine-scheduler +# RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=cortex-x1 -passes=machine-scheduler # Just ensure this doesn't crash. --- diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir b/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir index 623a8221f5ed2..e661353615726 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir +++ b/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir @@ -1,6 +1,9 @@ # RUN: llc -o /dev/null %s -run-pass=machine-scheduler -mtriple aarch64-- -mattr=-fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,NOFUSE # RUN: llc -o /dev/null %s -run-pass=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES # RUN: llc -o /dev/null %s -run-pass=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+fuse-crypto-eor,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES,FUSECRYPTO +# RUN: llc -o /dev/null %s -passes=machine-scheduler -mtriple aarch64-- -mattr=-fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,NOFUSE +# RUN: llc -o /dev/null %s -passes=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES +# RUN: llc -o /dev/null %s -passes=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+fuse-crypto-eor,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES,FUSECRYPTO # REQUIRES: asserts name: func diff --git a/llvm/test/CodeGen/AArch64/misched-move-imm.mir b/llvm/test/CodeGen/AArch64/misched-move-imm.mir index b5ff01b3c5b13..65608bb5f1a1c 100644 --- a/llvm/test/CodeGen/AArch64/misched-move-imm.mir +++ b/llvm/test/CodeGen/AArch64/misched-move-imm.mir @@ -1,4 +1,5 @@ # RUN: llc -run-pass=machine-scheduler -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 %s -o /dev/null 2>&1 +# RUN: llc -passes=machine-scheduler -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 %s -o /dev/null 2>&1 # Just ensure this doesn't crash. Ensures in the neoverse-v2 # scheduling model we don't attempt to treat the first input # operand of MOVZXi as an immediate operand. 
diff --git a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir index 0b14ceeef9a09..17a6cf7e6faa9 100644 --- a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir +++ b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir @@ -1,4 +1,5 @@ # RUN: llc -mcpu=exynos-m5 -mtriple=aarch64 -enable-misched -run-pass=machine-scheduler -debug-only=machine-scheduler %s -o /dev/null 2>&1 | FileCheck %s +# RUN: llc -mcpu=exynos-m5 -mtriple=aarch64 -enable-misched -passes=machine-scheduler -debug-only=machine-scheduler %s -o /dev/null 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK-LABEL: ********** MI Scheduling ********** diff --git a/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir b/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir index b04fd89b796ba..b652d2463fc12 100644 --- a/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir +++ b/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir @@ -3,11 +3,21 @@ # RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=true 2>&1 | FileCheck --check-prefix=SORTED %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ +# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ +# RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=true 2>&1 | FileCheck --check-prefix=SORTED %s + # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ # RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=false 2>&1 | FileCheck --check-prefix=UNSORTED %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ +# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ +# RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=false 2>&1 | FileCheck --check-prefix=UNSORTED %s + # REQUIRES: asserts, aarch64-registered-target --- name: test diff --git a/llvm/test/CodeGen/AArch64/sched-postidxalias.mir b/llvm/test/CodeGen/AArch64/sched-postidxalias.mir index 98ee0fa21b2dd..02256ca30d842 100644 --- a/llvm/test/CodeGen/AArch64/sched-postidxalias.mir +++ b/llvm/test/CodeGen/AArch64/sched-postidxalias.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=aarch64 -mcpu=cortex-a55 -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64 -mcpu=cortex-a55 -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s # REQUIRES: asserts # Both the accesses should have an offset of 0 diff --git a/llvm/test/CodeGen/AArch64/sched-print-cycle.mir b/llvm/test/CodeGen/AArch64/sched-print-cycle.mir index 59c51571df74b..d58037e987773 100644 --- a/llvm/test/CodeGen/AArch64/sched-print-cycle.mir +++ b/llvm/test/CodeGen/AArch64/sched-print-cycle.mir @@ -1,9 +1,15 @@ # RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 
-sched-print-cycles=true \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s + # RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=false \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NOCYCLES +# RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=false \ +# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NOCYCLES + # REQUIRES: asserts --- name: mul_mul diff --git a/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir b/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir index 65ec43407413f..66680af3f856b 100644 --- a/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir +++ b/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir @@ -1,4 +1,5 @@ # RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s +# RUN: llc -o /dev/null %s -mtriple=aarch64-- -passes=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts --- | define void @func() { ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-aliasing.mir b/llvm/test/CodeGen/AArch64/sve-aliasing.mir index 3b7c9fefa5277..34a08adc417cf 100644 --- a/llvm/test/CodeGen/AArch64/sve-aliasing.mir +++ b/llvm/test/CodeGen/AArch64/sve-aliasing.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -o - %s -mtriple=aarch64 -run-pass=machine-scheduler -verify-machineinstrs | FileCheck %s +# RUN: llc -o - %s -mtriple=aarch64 -passes=machine-scheduler | FileCheck %s --- name: scalable_v16i1 diff --git a/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir b/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir index 82ee173e12256..1c4093b2feb9b 100644 --- a/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir @@ -1,5 +1,7 @@ # RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o /dev/null %s 2>&1 | FileCheck %s +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=machine-scheduler -verify-misched -o /dev/null %s 2>&1 | FileCheck %s + # CHECK: *** Bad machine code: No live subrange at use *** # CHECK-NEXT: - function: at_least_one_value_should_be_defined_by_this_mask # CHECK-NEXT: - basic block: %bb.0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir index 0d84dc0bdc53e..1ae544f3c074a 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir +++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machine-scheduler %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=machine-scheduler %s -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: cluster_flat_loads # GCN: FLAT_LOAD_DWORD %0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir index 4945c7020ca18..b38dc4d21c10c 100644 --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs 
-run-pass=machine-scheduler -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=machine-scheduler -o - %s | FileCheck %s # The DBG_VALUE in bb.5 ends a scheduling region, and its uses should # not be tracked like a normal instruction. diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index 8a1c68b3f6615..156979d6d06a5 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machine-scheduler %s -o - | FileCheck %s --- | declare void @llvm.dbg.value(metadata, metadata, metadata) #0 diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir index 19071be7ebde4..d415346b49b28 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK: ********** MI Scheduling ********** diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir index 4f15e0ef68977..170672dc4af64 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK: All regions recorded, starting actual scheduling. 
diff --git a/llvm/test/CodeGen/AMDGPU/dumpcode.ll b/llvm/test/CodeGen/AMDGPU/dumpcode.ll new file mode 100644 index 0000000000000..1acec2997aa8f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dumpcode.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=dumpcode -filetype=obj < %s | llvm-objcopy --dump-section .AMDGPU.disasm=- - /dev/null | FileCheck %s -check-prefix=GFX10 + +; GFX10: f: +; GFX10-NEXT: BB0_0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; BF8C0000 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xde ; 7E0602FF 000000DE +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v4 ; 4A040881 +; GFX10-NEXT: s_mov_b32 s4, 0 ; BE840380 +; GFX10-NEXT: global_store_dword v[0:1], v3, off ; DC708000 007D0300 +; GFX10-NEXT: BB0_1: +; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2 ; 4A0404C1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; 7D840480 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; 8804046A +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; 8A7E047E +; GFX10-NEXT: s_cbranch_execnz "" ; BF890000 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 887E047E +; GFX10-NEXT: s_setpc_b64 s[30:31] ; BE80201E + +define void @f(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) { +entry: + br label %body +body: + %i = phi i32 [0, %entry], [%inc, %body] + store i32 222, ptr addrspace(1) %out + %cmp = icmp ne i32 %i, %val + %inc = add i32 %i, 1 + br i1 %cmp, label %body, label %end +end: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir b/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir index 962d49df8509e..204912b4d4881 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir +++ b/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=tonga -passes=machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: flat_load_clustering # GCN: FLAT_LOAD_DWORD diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir index d57450baea911..78f21ef6610f2 100644 --- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -1,6 +1,8 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @high-RP-reschedule() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll 
b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll new file mode 100644 index 0000000000000..200d68b2dc1a9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 --stop-after=si-fix-sgpr-copies < %s | FileCheck %s + +; iglp.opt should not be flagged as clobbering the memory operand for the global_load, and we should be able to +; lower into the scalar version (i.e. should not need to lower into vector version with waterfall loop) +; CHECK-NOT: WATERFALL + +define amdgpu_kernel void @_attn_forward_fp8e5_128x32x64_BW128(ptr addrspace(1) %in, ptr addrspace(3) %out) { +.lr.ph: + br label %1 + +1: ; preds = %1, %.lr.ph + %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %1 ] + %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %1 ] + %inc = phi i32 [0, %.lr.ph], [ %incCond, %1 ] + %rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %addr, i16 0, i32 0, i32 0) + %load = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) + %load.bc = bitcast <2 x i32> %load to <8 x i8> + %load.elem = extractelement <8 x i8> %load.bc, i64 0 + tail call void @llvm.amdgcn.iglp.opt(i32 0) + %vec = insertelement <4 x i8> zeroinitializer, i8 %load.elem, i64 0 + %vec.bc = bitcast <4 x i8> %vec to <2 x half> + %shuff = shufflevector <2 x half> %vec.bc, <2 x half> zeroinitializer, <4 x i32> + %gep = getelementptr i8, ptr addrspace(1) %in, i64 %offset + %unmaskedload49 = load <1 x i64>, ptr addrspace(1) null, align 8 + %nextOff = extractelement <1 x i64> %unmaskedload49, i64 0 + %incCond = add i32 %inc, 1 + %cond = icmp eq i32 %incCond, 32 + br i1 %cond, label %2, label %1 + +2: + store <4 x half> %shuff, ptr addrspace(3) %out, align 8 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 8d380516df8b5..452033f332659 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NEXT: v_mov_b32_e32 v10, s18 ; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NEXT: v_mov_b32_e32 v10, s18 ; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: 
s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 44cb4e803ffad..4628a9c15391b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -19,7 +19,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -39,7 +39,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], 
v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -135,7 +135,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) @@ -186,7 +186,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 ; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 4 ; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -316,7 +316,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 ; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 4 ; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -383,7 +383,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -430,7 +430,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -475,7 +475,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -776,7 +776,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] 
offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -855,7 +855,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -919,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -939,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -971,7 +971,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -992,7 +992,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) @@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1043,7 +1043,7 @@ define amdgpu_kernel void 
@test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1) @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -1352,7 +1352,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1397,7 +1397,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -1754,7 +1754,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 
a[0:15], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -1801,7 +1801,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -1838,7 +1838,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -1865,7 +1865,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) @@ -1939,7 +1939,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 9a8282231ac15..25b857f8f47dd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -24,7 +24,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GCN-NEXT: s_nop 1 ; 
GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -48,7 +48,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -72,7 +72,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -96,7 +96,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -120,7 +120,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -144,7 +144,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -168,7 +168,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -192,7 +192,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -217,7 +217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -242,7 +242,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 
x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -267,7 +267,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -292,7 +292,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -317,7 +317,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -342,7 +342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -367,7 +367,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -392,7 +392,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -417,7 +417,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -442,7 +442,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -467,7 +467,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; 
GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -492,7 +492,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -518,7 +518,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -543,7 +543,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -567,7 +567,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -592,7 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -617,7 +617,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -642,7 +642,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -667,7 +667,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -692,7 +692,7 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -717,7 +717,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -742,7 +742,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -767,7 +767,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -791,7 +791,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -815,7 +815,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -839,7 +839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -863,7 +863,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -889,7 +889,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, 
a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -914,7 +914,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -939,7 +939,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -964,7 +964,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -988,7 +988,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1012,7 +1012,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1036,7 +1036,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1060,7 +1060,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1084,7 +1084,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1108,7 +1108,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: 
v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1132,7 +1132,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1156,7 +1156,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1181,7 +1181,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1206,7 +1206,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1231,7 +1231,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1256,7 +1256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1280,7 +1280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1304,7 +1304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1328,7 +1328,7 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1352,7 +1352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v13 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1376,7 +1376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1400,7 +1400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1429,7 +1429,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1450,7 +1450,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1471,7 +1471,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1512,7 +1512,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1545,7 +1545,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; 
GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1574,7 +1574,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1599,7 +1599,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1628,7 +1628,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1653,7 +1653,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1682,7 +1682,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1707,7 +1707,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1728,7 +1728,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, s16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1757,7 +1757,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1782,7 +1782,7 @@ define <4 x float> 
@test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, s24 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1803,7 +1803,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1825,7 +1825,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1843,7 +1843,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1866,7 +1866,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1927,7 +1927,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; @@ -1953,7 +1953,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) @@ -1993,7 +1993,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_nop 1 ; 
SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) @@ -2040,7 +2040,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2062,7 +2062,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2083,7 +2083,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2104,7 +2104,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2129,7 +2129,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2153,7 +2153,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2176,7 +2176,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; 
GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2199,7 +2199,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2223,7 +2223,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2247,7 +2247,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2271,7 +2271,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2295,7 +2295,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2318,7 +2318,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2341,7 +2341,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2365,4 +2365,4 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } -attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } \ No newline at end of file diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 05f8739e7cb89..3d959393a8fa7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -40,7 +40,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -85,7 +85,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -137,7 +137,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -182,7 +182,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -234,7 +234,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -279,7 +279,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -331,7 +331,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -376,7 +376,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: 
v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -428,7 +428,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -473,7 +473,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -525,7 +525,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -570,7 +570,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -622,7 +622,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -667,7 +667,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -719,7 +719,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -764,7 +764,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -815,7 +815,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: 
v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -868,7 +868,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -913,7 +913,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -963,7 +963,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1015,7 +1015,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1064,7 +1064,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1116,7 +1116,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1165,7 +1165,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1215,7 +1215,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1264,7 +1264,7 @@ define <16 x float> 
@test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1317,7 +1317,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1362,7 +1362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1412,7 +1412,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1465,7 +1465,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1510,7 +1510,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1561,7 +1561,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1613,7 +1613,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1662,7 +1662,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: 
v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1714,7 +1714,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1763,7 +1763,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1813,7 +1813,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1862,7 +1862,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1914,7 +1914,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1963,7 +1963,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2015,7 +2015,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2064,7 +2064,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2113,7 +2113,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 
+; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2161,7 +2161,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2210,7 +2210,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2258,7 +2258,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2311,7 +2311,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2360,7 +2360,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2412,7 +2412,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2461,7 +2461,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2510,7 +2510,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2558,7 +2558,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 ; 
GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2607,7 +2607,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2655,7 +2655,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2704,7 +2704,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2752,7 +2752,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2801,7 +2801,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2849,7 +2849,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2899,7 +2899,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2948,7 +2948,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2998,7 +2998,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], 
a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3047,7 +3047,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3096,7 +3096,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3144,7 +3144,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3193,7 +3193,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3241,7 +3241,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3290,7 +3290,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3338,7 +3338,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3393,7 +3393,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3441,7 +3441,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; 
GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3489,7 +3489,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, s0 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3567,7 +3567,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3637,7 +3637,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3691,7 +3691,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3741,7 +3741,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3795,7 +3795,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3845,7 +3845,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3899,7 +3899,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, 
a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3949,7 +3949,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3995,7 +3995,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4041,7 +4041,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4111,7 +4111,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4177,7 +4177,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4224,7 +4224,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 33, -2 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4272,7 +4272,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, -2 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4316,7 +4316,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, -2 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4365,7 +4365,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v31 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4410,7 +4410,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4477,7 +4477,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 @@ -4520,7 +4520,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 @@ -4576,7 +4576,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -4619,7 +4619,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -4765,7 +4765,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -4912,7 +4912,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -5059,7 +5059,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: 
global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -5206,7 +5206,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 @@ -5247,7 +5247,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5294,7 +5294,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5341,7 +5341,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 1 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5388,7 +5388,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 1, 0 op_sel_hi:[0,0,0] ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5441,7 +5441,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5486,7 +5486,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5538,7 +5538,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5583,7 +5583,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: 
v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5634,7 +5634,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5678,7 +5678,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5727,7 +5727,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5779,7 +5779,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5824,7 +5824,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5876,7 +5876,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5921,7 +5921,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5972,7 +5972,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: 
s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6023,7 +6023,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6074,7 +6074,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6118,7 +6118,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6167,7 +6167,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6204,4 +6204,4 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } -attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 66c02a9bd0c6a..6b922fcd9b550 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -59,7 +59,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] ; GISEL-NEXT: s_endpgm bb: @@ -81,7 +81,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> % ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: 
v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -92,7 +92,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> % ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -112,7 +112,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -123,7 +123,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -143,7 +143,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -154,7 +154,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -187,7 +187,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -279,7 +279,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -317,7 +317,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG-NEXT: s_nop 1 ; 
SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -389,7 +389,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -461,7 +461,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -561,7 +561,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -650,7 +650,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; GCN-NEXT: s_endpgm bb: @@ -672,7 +672,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -692,7 +692,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -712,7 +712,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1 ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -745,7 +745,7 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0 ; GCN-NEXT: v_mov_b32_e32 v12, s28 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -788,7 +788,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 -; 
GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -826,7 +826,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -871,7 +871,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -916,7 +916,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -989,7 +989,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1046,7 +1046,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1072,7 +1072,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -1094,7 +1094,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1105,7 +1105,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1125,7 +1125,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: 
v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1136,7 +1136,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1156,7 +1156,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1167,7 +1167,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1200,7 +1200,7 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1265,7 +1265,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -1298,7 +1298,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -1336,7 +1336,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1408,7 +1408,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1480,7 +1480,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; 
SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1580,7 +1580,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1675,7 +1675,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1701,7 +1701,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -1723,7 +1723,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1734,7 +1734,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1754,7 +1754,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1765,7 +1765,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1785,7 +1785,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1796,7 +1796,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, < 
; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1829,7 +1829,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1890,7 +1890,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1916,7 +1916,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -1938,7 +1938,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1949,7 +1949,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -1969,7 +1969,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1980,7 +1980,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2000,7 +2000,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2011,7 +2011,7 @@ define <4 x float> 
@test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2044,7 +2044,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2105,7 +2105,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2131,7 +2131,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -2153,7 +2153,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2164,7 +2164,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2184,7 +2184,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2195,7 +2195,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2215,7 +2215,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: 
v_accvgpr_read_b32 v2, a2 @@ -2226,7 +2226,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2259,7 +2259,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2320,7 +2320,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2346,7 +2346,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: @@ -2368,7 +2368,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2379,7 +2379,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2399,7 +2399,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2410,7 +2410,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2430,7 +2430,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2441,7 +2441,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, < ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 +; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GISEL-NEXT: v_mov_b32_e32 v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v2, v14 @@ -2474,7 +2474,7 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2539,7 +2539,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -2572,7 +2572,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2610,7 +2610,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2682,7 +2682,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2754,7 +2754,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2854,7 +2854,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2953,7 +2953,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], 
v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -2986,7 +2986,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3024,7 +3024,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3096,7 +3096,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3168,7 +3168,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3268,7 +3268,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3367,7 +3367,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -3400,7 +3400,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3438,7 +3438,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; 
SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3510,7 +3510,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3582,7 +3582,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3682,7 +3682,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3781,7 +3781,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] @@ -3814,7 +3814,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3852,7 +3852,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3924,7 +3924,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3996,7 +3996,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4096,7 +4096,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: s_nop 1 ; 
SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4156,4 +4156,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll new file mode 100644 index 0000000000000..684b59c66ee8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id-unsupported-calling-convention.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -O0 -stop-after=amdgpu-isel -o - %s | FileCheck --check-prefix=SelDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stop-after=legalizer -o - %s | FileCheck --check-prefix=GlobalISel %s + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.workitem.id.y() +declare i32 @llvm.amdgcn.workitem.id.z() + +define amdgpu_ps void @undefined_workitems(ptr %p, ptr %q, ptr %r) { + ; SelDAG-LABEL: name: undefined_workitems + ; SelDAG: bb.0 (%ir-block.0): + ; SelDAG-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; SelDAG-NEXT: {{ $}} + ; SelDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; SelDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; SelDAG-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; SelDAG-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; SelDAG-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; SelDAG-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; SelDAG-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; SelDAG-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; SelDAG-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; SelDAG-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; SelDAG-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; SelDAG-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]] + ; SelDAG-NEXT: S_ENDPGM 0 + ; + ; GlobalISel-LABEL: name: undefined_workitems + ; GlobalISel: bb.1 (%ir-block.0): + ; GlobalISel-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GlobalISel-NEXT: {{ $}} + ; GlobalISel-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GlobalISel-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GlobalISel-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GlobalISel-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GlobalISel-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GlobalISel-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GlobalISel-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GlobalISel-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GlobalISel-NEXT: [[MV2:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GlobalISel-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GlobalISel-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GlobalISel-NEXT: G_STORE [[COPY6]](s32), [[MV]](p0) :: (store 
(s32) into %ir.p) + ; GlobalISel-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) + ; GlobalISel-NEXT: G_STORE [[COPY7]](s32), [[MV1]](p0) :: (store (s32) into %ir.q) + ; GlobalISel-NEXT: G_STORE [[DEF]](s32), [[MV2]](p0) :: (store (s32) into %ir.r) + ; GlobalISel-NEXT: S_ENDPGM 0 + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + store i32 %id.x, ptr %p + %id.y = call i32 @llvm.amdgcn.workitem.id.y() + store i32 %id.y, ptr %q + %id.z = call i32 @llvm.amdgcn.workitem.id.z() + store i32 %id.z, ptr %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll index e7589690cd670..99fcbc595ff7f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll @@ -59,6 +59,24 @@ define <2 x ptr addrspace(7)> @gep_vector_scalar(<2 x ptr addrspace(7)> %in, i64 ret <2 x ptr addrspace(7)> %ret } +define <2 x ptr addrspace(7)> @gep_scalar_vector(ptr addrspace(7) %in, <2 x i32> %idxs) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @gep_scalar_vector +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[IN:%.*]], <2 x i32> [[IDXS:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[IN_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[IN]], 0 +; CHECK-NEXT: [[IN_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[IN]], 1 +; CHECK-NEXT: [[IN_RSRC_SPLATINSERT:%.*]] = insertelement <2 x ptr addrspace(8)> poison, ptr addrspace(8) [[IN_RSRC]], i64 0 +; CHECK-NEXT: [[IN_RSRC_SPLAT:%.*]] = shufflevector <2 x ptr addrspace(8)> [[IN_RSRC_SPLATINSERT]], <2 x ptr addrspace(8)> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[IN_OFF_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[IN_OFF]], i64 0 +; CHECK-NEXT: [[IN_OFF_SPLAT:%.*]] = shufflevector <2 x i32> [[IN_OFF_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[RET:%.*]] = add <2 x i32> [[IN_OFF_SPLAT]], [[IDXS]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[IN_RSRC_SPLAT]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]], <2 x i32> [[RET]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP2]] +; + %ret = getelementptr inbounds i8, ptr addrspace(7) %in, <2 x i32> %idxs + ret <2 x ptr addrspace(7)> %ret +} + define ptr addrspace(7) @simple_gep(ptr addrspace(7) %ptr, i32 %off) { ; CHECK-LABEL: define { ptr addrspace(8), i32 } @simple_gep ; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]], i32 [[OFF:%.*]]) #[[ATTR0]] { diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir index e32de1e42aac4..5dc6d2ee8f695 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s # REQUIRES: asserts --- | diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir 
b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index fb65d80c46e06..9991cb1837e01 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - | FileCheck -check-prefix=GFX908 %s --- name: test_occ_10_max_occ_no_sink diff --git a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir index 2aa430400e49a..ffc86dc5eee6f 100644 --- a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir +++ b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -passes=machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: cluster_add_addc # GCN: S_NOP 0, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 819b6ca98b3a8..a7060e4f198f1 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s @@ -329,14 +330,23 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; GFX9: ; %bb.0: @@ -363,6 +373,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 95d579be04ed2..dd4ad0a70b254 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s @@ -268,11 +269,19 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { -; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; 
SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GFX900: ; %bb.0: @@ -303,6 +312,12 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -329,13 +344,21 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 { -; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; GFX900: ; %bb.0: @@ -368,6 +391,14 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -389,14 +420,24 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src ; operation only clobbers relevant lane. 
define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v4, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v2f32: ; GFX900: ; %bb.0: @@ -453,6 +494,15 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v2f32: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -492,15 +542,26 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half } define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v3f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_mov_b32_e32 v0, v6 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v7, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32: ; SDAG-GFX900: ; %bb.0: @@ -573,6 +634,16 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v3f32: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v3f32: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -640,17 +711,32 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half } define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.h, v7.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v8, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v6, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 
s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v4f32: ; SDAG-GFX900: ; %bb.0: @@ -742,6 +828,18 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v4f32: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -826,14 +924,27 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; FIXME (DAG): Fold clamp define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, 
v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GFX900: ; %bb.0: @@ -890,6 +1001,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -948,18 +1068,36 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; FIXME (GIsel): V_PK_MAX clamp could be folded into mixlo define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, v6 clamp -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v3, v5, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, 0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v2, v2 clamp +; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX900: ; %bb.0: @@ -1146,18 +1284,36 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, v6 clamp -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, v7 clamp -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v7.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v6, v7, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v2, v2 clamp +; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; 
SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v7, v7 clamp +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-GFX900: ; %bb.0: @@ -1383,14 +1539,28 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; a build_vector to select the mixhi. Issue is more specifically with how insert_vector_elt is being ; legalized (bitwise ops instead of shuffle/build_vector for instance). define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-GFX900: ; %bb.0: @@ -1539,14 +1709,27 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> } define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; SDAG-GFX900: ; %bb.0: @@ -1702,17 +1885,32 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; FIXME (DAG): Should be able to use mixlo/mixhi define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v1 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v3, v4, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_precvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: 
v_cvt_f16_f32_e32 v1, v3 +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: ; SDAG-GFX900: ; %bb.0: @@ -1848,19 +2046,37 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr ; FIXME (DAG): Handling undef 4th component define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v2 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v6, v7, v8 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_precvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: ; SDAG-GFX900: ; %bb.0: @@ -2028,23 +2244,48 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> 
%src0, <3 x half> %sr } define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6 -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v7 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v3 -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v8, v9, v10 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v6, v7, v11 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.h, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_precvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SDAG-GFX1100-FAKE16-NEXT: 
v_cvt_f16_f32_e32 v3, v7 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: ; SDAG-GFX900: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index 30e3bc3ba5da8..4c2a16c17b38a 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100,SDAG-GFX1100-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100,SDAG-GFX1100-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx9-generic -verify-machineinstrs --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX9GEN,SDAG-GFX9GEN %s @@ -197,14 +198,26 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % } define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v3, v4, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32: ; SDAG-GFX900: ; %bb.0: @@ -268,6 +281,15 @@ define <2 x float> 
@v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; SDAG-CI-NEXT: v_mac_f32_e32 v0, v4, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX900-LABEL: v_mad_mix_v2f32: ; GISEL-GFX900: ; %bb.0: ; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -330,14 +352,24 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal } define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_shuffle: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_shuffle: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v4, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_shuffle: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v2f32_shuffle: ; GFX900: ; %bb.0: @@ -396,6 +428,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, ; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_shuffle: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1214,15 +1255,28 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { } define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s0, 1.0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_mov_b32 s0, 1.0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v2, v3, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_f32imm1: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: s_mov_b32 s0, 1.0 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1: ; SDAG-GFX900: ; %bb.0: @@ -1346,15 +1400,28 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) } define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x3e230000 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_mov_b32 s0, 0x3e230000 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v2, v3, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: s_mov_b32 s0, 0x3e230000 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: ; SDAG-GFX900: ; %bb.0: @@ -1485,15 +1552,28 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> } define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { -; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi: -; SDAG-GFX1100: ; %bb.0: -; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0.15915494 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] -; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2 -; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_mov_b32 s0, 0.15915494 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v2, v3, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: s_mov_b32 s0, 0.15915494 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: ; SDAG-GFX900: ; %bb.0: @@ -1887,16 +1967,30 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl } define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 { -; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX1100-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; 
SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v2, v0 +; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX1100-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX900: ; %bb.0: @@ -1945,6 +2039,17 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1963,15 +2068,27 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, } define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 { -; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v1, v0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; 
SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX1100-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX900: ; %bb.0: @@ -2016,6 +2133,16 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2229,14 +2356,23 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1 ; Make sure we don't fold pre-cvt fneg if we already have a fabs define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { -; GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; GFX900: ; %bb.0: @@ -2280,6 +2416,15 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir index 1eb7ec4c142f2..0af37ad8c896e 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir @@ -734,7 +734,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16_write_vgpr_flat_read # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: FLAT_STORE_DWORD name: smfmac16x16_write_vgpr_flat_read body: | @@ -745,7 +746,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD name: xdl_smfma16x16_write_vgpr_flat_read body: | @@ -756,7 +758,8 @@ body: | # GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD name: smfmac32x32_write_vgpr_flat_read body: | @@ -768,7 +771,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD name: xdl_smfma32x32_write_vgpr_flat_read body: | @@ -823,7 +827,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_read body: | @@ -835,7 +840,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma32x32_write_vgpr_valu_read body: | @@ -881,7 +887,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: xdl_smfma16x16_write_vgpr_accv_read body: | @@ -893,7 +900,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: xdl_smfma32x32_write_vgpr_accv_read body: | @@ -950,7 +958,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_write body: | @@ -962,7 +971,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: xdl_smfma32x32_write_vgpr_valu_write body: | @@ -983,7 +993,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_FMA_F16_e64 name: xdl_smfma16x16_write_vgpr_valu_f16_write body: | @@ -995,7 +1006,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# 
GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_FMA_F16_e64 name: xdl_smfma32x32_write_vgpr_valu_f16_write body: | @@ -1016,7 +1028,8 @@ body: | # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32_sdwa name: xdl_smfma16x16_write_vgpr_valu_sdwa_write body: | @@ -1028,7 +1041,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32_sdwa name: xdl_smfma32x32_write_vgpr_valu_sdwa_write body: | @@ -1752,7 +1766,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MOV_B32 name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_write body: | @@ -1762,7 +1777,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_vm_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: BUFFER_STORE_DWORD name: xdl_sgemm16X16X16_mfma_write_vgpr_vm_read body: | @@ -1772,7 +1788,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MOV_B32 name: xdl_sgemm16X16X16_mfma_write_vgpr_valu_read body: | @@ -1782,7 +1799,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dot_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_DOT name: xdl_sgemm16X16X16_mfma_write_vgpr_dot_read body: | @@ -2060,7 +2078,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16_read_vgpr_srcc_valu_write # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MOV_B32 name: smfmac16x16_read_vgpr_srcc_valu_write body: | @@ -2090,7 +2109,8 @@ body: | # GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 name: smfmac32x32_read_vgpr_srcc_valu_write body: | diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir index 057769372c041..d4a20c1074a95 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -run-pass=peephole-opt -o - %s | FileCheck %s +# %2.sub0 -> %0.sub0 --- name: reg_sequence_extract_subreg_sub0_from_regsequence_sub0_sub1 tracksRegLiveness: true @@ -24,6 +25,7 @@ body: | ... +# %2.sub1 -> %1 --- name: reg_sequence_extract_subreg_sub1_from_regsequence_sub0_sub1 tracksRegLiveness: true @@ -47,6 +49,7 @@ body: | ... +# %2.sub0 -> %0.sub0 --- name: reg_sequence_extract_subreg_sub0_from_regsequence_sub1_sub0 tracksRegLiveness: true @@ -70,6 +73,7 @@ body: | ... +# %2.sub1 -> %1 --- name: reg_sequence_extract_subreg_sub1_from_regsequence_sub1_sub0 tracksRegLiveness: true @@ -93,6 +97,7 @@ body: | ... +# %2.sub0 -> %0.sub0 --- name: reg_sequence_extract_subreg_sub0_from_vreg96 tracksRegLiveness: true @@ -116,6 +121,7 @@ body: | ... 
+# %2.sub0 -> %0.sub0 --- name: reg_sequence_extract_subreg_sub1_from_vreg96 tracksRegLiveness: true @@ -139,6 +145,133 @@ body: | ... +--- +name: reg_sequence_v128_compose_reg_sequence64_x2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: reg_sequence_v128_compose_reg_sequence64_x2 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY4]] + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_128 = REG_SEQUENCE %4, %subreg.sub0_sub1, %5, %subreg.sub2_sub3 + %7:vgpr_32 = COPY %6.sub2 + S_ENDPGM 0, implicit %7 + +... + +--- +name: reg_sequence_v128_compose_reg_sequence128_subreg64_extract32_sub2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: reg_sequence_v128_compose_reg_sequence128_subreg64_extract32_sub2 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:vreg_128 = REG_SEQUENCE %0.sub0_sub1, %subreg.sub0, %1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub2 + S_ENDPGM 0, implicit %3 + +... + +--- +name: reg_sequence_v128_align2_compose_reg_sequence128_subreg64_extract32_sub2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: reg_sequence_v128_align2_compose_reg_sequence128_subreg64_extract32_sub2 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:vreg_128_align2 = REG_SEQUENCE %0.sub0_sub1, %subreg.sub0, %1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub2 + S_ENDPGM 0, implicit %3 + +... 
+ +# Unhandled, spans multiple sources +--- +name: reg_sequence_v128_align2_compose_reg_sequence128_subreg64_align2_extract64_sub1_sub2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: reg_sequence_v128_align2_compose_reg_sequence128_subreg64_align2_extract64_sub1_sub2 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub1_sub2 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:vreg_128_align2 = REG_SEQUENCE %0.sub0_sub1, %subreg.sub0, %1, %subreg.sub2_sub3 + %3:vreg_64_align2 = COPY %2.sub1_sub2 + S_ENDPGM 0, implicit %3 + +... + +# Unhandled, spans multiple sources +--- +name: reg_sequence_v128_align2_compose_reg_sequence128_subreg64_align2_extract64_sub2_sub3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: reg_sequence_v128_align2_compose_reg_sequence128_subreg64_align2_extract64_sub2_sub3 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub1_sub2 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + %2:vreg_128_align2 = REG_SEQUENCE %0.sub0_sub1, %subreg.sub0, %1, %subreg.sub2_sub3 + %3:vreg_64_align2 = COPY %2.sub1_sub2 + S_ENDPGM 0, implicit %3 + +... + + +# %2.sub1 -> %0.sub1 --- name: reg_sequence_compose_0 tracksRegLiveness: true @@ -162,6 +295,9 @@ body: | ... +# %2.sub0 -> %0.sub2 +# %2.sub1 -> %0.sub3 +# %2.sub2 -> %0.sub0 --- name: reg_sequence_compose_1 tracksRegLiveness: true @@ -175,15 +311,362 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_96 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %0.sub0, %subreg.sub2 + %3:vgpr_32 = COPY %2.sub0 + %4:vgpr_32 = COPY %2.sub1 + %5:vgpr_32 = COPY %2.sub2 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5 + +... 
+ +# %3.sub0 -> %0.sub2 +--- +name: reg_sequence_compose_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4 + + ; CHECK-LABEL: name: reg_sequence_compose_2 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %0.sub0, %subreg.sub2, %2, %subreg.sub3 + %4:vgpr_32 = COPY %3.sub0 + S_ENDPGM 0, implicit %4 + +... + +# %3.sub1 -> %0.sub3 +# %3.sub0 -> %0.sub2 +--- +name: reg_sequence_compose_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4 + + ; CHECK-LABEL: name: reg_sequence_compose_3 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]], implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %0.sub0, %subreg.sub2, %2, %subreg.sub3 + %4:vgpr_32 = COPY %3.sub1 + %5:vgpr_32 = COPY %3.sub0 + S_ENDPGM 0, implicit %4, implicit %5 + +... + +# %3.sub2 -> %0.sub0 +--- +name: reg_sequence_compose_4 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4 + + ; CHECK-LABEL: name: reg_sequence_compose_4 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - %2:vreg_96 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %0.sub0, %subreg.sub2 + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %0.sub0, %subreg.sub2, %2, %subreg.sub3 + %4:vgpr_32 = COPY %3.sub2 + S_ENDPGM 0, implicit %4 + +... 
+ +# %2.sub3 -> %1.sub1 +--- +name: reg_sequence_compose_5 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_5 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %1.sub0_sub1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub3 + S_ENDPGM 0, implicit %3 + +... + +# %2.sub0 -> %0.sub2 +--- +name: reg_sequence_compose_6 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_6 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %1.sub0_sub1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub0 + S_ENDPGM 0, implicit %3 + +... + +# %2.sub1 -> %0.sub3 +--- +name: reg_sequence_compose_7 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_7 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %1.sub0_sub1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub1 + S_ENDPGM 0, implicit %3 + +... 
+ +# %2.sub2 -> %1.sub0 +--- +name: reg_sequence_compose_8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_8 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %1.sub0_sub1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub2 + S_ENDPGM 0, implicit %3 + +... + +# %2.sub3 -> %1.sub1 +--- +name: reg_sequence_compose_9 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_9 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub0_sub1, %1.sub0_sub1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub3 + S_ENDPGM 0, implicit %3 + +... + +# %2.sub3 -> %1.sub1 +--- +name: reg_sequence_compose_10 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_10 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]].sub0_sub1, %subreg.sub2_sub3, [[COPY]].sub2_sub3, %subreg.sub0_sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %1.sub0_sub1, %subreg.sub2_sub3, %0.sub2_sub3, %subreg.sub0_sub1 + %3:vgpr_32 = COPY %2.sub3 + S_ENDPGM 0, implicit %3 + +... 
+ +# %2.sub1 -> %0.sub3 +--- +name: reg_sequence_compose_11 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + + ; CHECK-LABEL: name: reg_sequence_compose_11 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]].sub0_sub1, %subreg.sub2_sub3, [[COPY]].sub2_sub3, %subreg.sub0_sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 + %2:vreg_128 = REG_SEQUENCE %1.sub0_sub1, %subreg.sub2_sub3, %0.sub2_sub3, %subreg.sub0_sub1 %3:vgpr_32 = COPY %2.sub1 S_ENDPGM 0, implicit %3 ... +# %2.sub0 -> %0.sub1 +--- +name: reg_sequence_compose_12 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6 + ; CHECK-LABEL: name: reg_sequence_compose_12 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub1_sub2_sub3, %subreg.sub0_sub1_sub2, [[COPY1]].sub1, %subreg.sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_64 = COPY $vgpr5_vgpr6 + %2:vreg_128 = REG_SEQUENCE %0.sub1_sub2_sub3, %subreg.sub0_sub1_sub2, %1.sub1, %subreg.sub3 + %3:vgpr_32 = COPY %2.sub0 + S_ENDPGM 0, implicit %3 + +... + +# %2.sub0 -> %0.sub1 +# %2.sub1 -> %0.sub2 +# %2.sub2 -> %0.sub3 +# %2.sub3 -> %1.sub1 +--- +name: reg_sequence_compose_13 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6 + + ; CHECK-LABEL: name: reg_sequence_compose_13 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub1_sub2_sub3, %subreg.sub0_sub1_sub2, [[COPY1]].sub1, %subreg.sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_64 = COPY $vgpr5_vgpr6 + %2:vreg_128 = REG_SEQUENCE %0.sub1_sub2_sub3, %subreg.sub0_sub1_sub2, %1.sub1, %subreg.sub3 + %3:vgpr_32 = COPY %2.sub0 + %4:vgpr_32 = COPY %2.sub1 + %5:vgpr_32 = COPY %2.sub2 + %6:vgpr_32 = COPY %2.sub3 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %6 + +... 
+ +# %2.sub0 -> %0.sub1 +# %2.sub1 -> %0.sub2 +# %2.sub2 -> %1.sub0 +# %2.sub3 -> %1.sub1 +# %2.sub4 -> %0.sub2 +# %2.sub5 -> %0.sub3 +--- +name: reg_sequence_compose_14 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6 + + ; CHECK-LABEL: name: reg_sequence_compose_14 + ; CHECK: liveins: $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub4_sub5, [[COPY]].sub1_sub2, %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub4 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub5 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]] + %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 + %1:vreg_64 = COPY $vgpr5_vgpr6 + %2:vreg_192 = REG_SEQUENCE %0.sub2_sub3, %subreg.sub4_sub5, %0.sub1_sub2, %subreg.sub0_sub1, %1, %subreg.sub2_sub3 + %3:vgpr_32 = COPY %2.sub0 + %4:vgpr_32 = COPY %2.sub1 + %5:vgpr_32 = COPY %2.sub2 + %6:vgpr_32 = COPY %2.sub3 + %7:vgpr_32 = COPY %2.sub4 + %8:vgpr_32 = COPY %2.sub5 + S_ENDPGM 0, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8 + +... diff --git a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir new file mode 100644 index 0000000000000..90291221e8e17 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir @@ -0,0 +1,413 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=peephole-opt -o - %s | FileCheck %s + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_1_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_1_0 + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY2]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY3]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:vreg_64_align2 = V_PK_MOV_B32 12, %2, 8, %2, 0, 0, 0, 0, 0, implicit $exec + %4:vgpr_32 = COPY %3.sub1 + %5:vgpr_32 = COPY %3.sub0 + $vgpr4 = COPY %4 + $vgpr5 = COPY %5 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... 
+ +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_sgpr_1_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8, $sgpr9 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_sgpr_1_0 + ; CHECK: liveins: $sgpr8, $sgpr9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY2]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY3]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:sreg_32 = COPY $sgpr8 + %1:sreg_32 = COPY $sgpr9 + %2:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:vreg_64_align2 = V_PK_MOV_B32 12, %2, 8, %2, 0, 0, 0, 0, 0, implicit $exec + %4:vgpr_32 = COPY %3.sub1 + %5:vgpr_32 = COPY %3.sub0 + $vgpr4 = COPY %4 + $vgpr5 = COPY %5 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... 
+ +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_sgpr_3_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr8, $vgpr9, $sgpr10, $sgpr11 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_sgpr_3_0 + ; CHECK: liveins: $vgpr8, $vgpr9, $sgpr10, $sgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr11 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr8 + %1:vgpr_32 = COPY $vgpr9 + %2:sreg_64 = COPY $sgpr10 + %3:sreg_64 = COPY $sgpr11 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_vgpr_3_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8, $sgpr9, $vgpr10, $vgpr11 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_sgpr_vgpr_3_0 + ; CHECK: liveins: $sgpr8, $sgpr9, $vgpr10, $vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:sreg_32 = COPY $sgpr8 + %1:sreg_32 = COPY $sgpr9 + %2:vgpr_32 = COPY $vgpr10 + %3:vgpr_32 = COPY $vgpr11 + %4:sreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... 
+ +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_lhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_lhs + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, undef %4, 8, %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_rhs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_rhs + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, %4, 8, undef %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... 
+ +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_undef +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_undef_undef + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64_align2 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_64_align2 = V_PK_MOV_B32 12, undef %4, 8, undef %5, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = COPY %6.sub1 + %8:vgpr_32 = COPY %6.sub0 + $vgpr4 = COPY %7 + $vgpr5 = COPY %8 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_compose_src_subregs_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_3_0_compose_src_subregs_0 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub0_sub1, 8, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY6]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY7]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vreg_96_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2 + %7:vreg_96_align2 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1, %5, %subreg.sub2 + %8:vreg_64_align2 = V_PK_MOV_B32 12, %6.sub0_sub1, 8, 
%7.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %9:vgpr_32 = COPY %8.sub1 + %10:vgpr_32 = COPY %8.sub0 + $vgpr4 = COPY %9 + $vgpr5 = COPY %10 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... + +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_0_3_compose_src_subregs_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_0_3_compose_src_subregs_0 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, [[REG_SEQUENCE]].sub0_sub1, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY6]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY7]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vreg_96_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2 + %7:vreg_96_align2 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1, %5, %subreg.sub2 + %8:vreg_64_align2 = V_PK_MOV_B32 8, %6.sub0_sub1, 12, %7.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %9:vgpr_32 = COPY %8.sub1 + %10:vgpr_32 = COPY %8.sub0 + $vgpr4 = COPY %9 + $vgpr5 = COPY %10 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... 
+ +--- +name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_compose_src_subregs_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + + ; CHECK-LABEL: name: v_pk_mov_b32__reg_sequence_shuffle_vgpr_vgpr_compose_src_subregs_1 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub2_sub3, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: $vgpr4 = COPY [[COPY8]] + ; CHECK-NEXT: $vgpr5 = COPY [[COPY9]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vgpr_32 = COPY $vgpr6 + %7:vgpr_32 = COPY $vgpr7 + %8:vreg_128_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 + %9:vreg_128_align2 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1, %6, %subreg.sub2, %7, %subreg.sub3 + %10:vreg_64_align2 = V_PK_MOV_B32 12, %8.sub2_sub3, 12, %9.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %11:vgpr_32 = COPY %10.sub1 + %12:vgpr_32 = COPY %10.sub0 + $vgpr4 = COPY %11 + $vgpr5 = COPY %12 + S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir index 85d0c054754d0..ede043ce73a47 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-dbg.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s --- | define void @test_vreg_96_w64() !dbg !5 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir index 037f39df8c3e0..79e9ce2737695 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses-gen.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s --- name: test_subregs_composition_vreg_1024 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir index 07e49dcdafd8c..33007ee8a7c38 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-partial-reg-uses.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-rewrite-partial-reg-uses=true -verify-machineinstrs -start-before=rename-independent-subregs -stop-after=rewrite-partial-reg-uses %s -o - | FileCheck -check-prefix=CHECK %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes="rename-independent-subregs,amdgpu-rewrite-partial-reg-uses" %s -o - | FileCheck -check-prefix=CHECK %s --- name: test_subregs_composition_vreg_1024 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index c933fb0de5864..c90975959c3f4 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=machine-scheduler -verify-misched -o - %s | FileCheck %s # This would assert that a dead def should have no uses, but the dead # def and use have different subreg indices. 
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir index add7825a224ed..2cd78062ccbd7 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machine-scheduler %s -o - | FileCheck %s # The sequence of DBG_VALUEs forms a scheduling region with 0 real # instructions. The RegPressure tracker would end up skipping over any diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir index 3fdb0c7c0885b..f797b01d49bf8 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -verify-misched -o - %s | FileCheck %s # This would hang after removing edges from the SCHED_BARRIER since the number # of Preds/Succs would be left in an inconsistent state. diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index 09037709d51d8..3254f5e45e4f4 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=machine-scheduler -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes=machine-scheduler -o - %s | FileCheck %s --- | %struct.widget.0 = type { float, i32, i32 } diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 6796391aba675..3ca61d26e8e42 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -verify-misched -run-pass=machine-scheduler -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-misched -passes=machine-scheduler -o - %s | FileCheck %s --- name: handleMoveUp_incorrect_interval diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir index 0b1fd441256d8..099cfc4f1dd54 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=machine-scheduler -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=machine-scheduler -o - %s | FileCheck %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=machine-scheduler -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-passes=machine-scheduler -o - %s | FileCheck %s # Make sure FP mode is not a hard scheduling boundary --- diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index e67036f0bbbea..88e11c9ce3d1d 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=machine-scheduler -o - %s | FileCheck %s --- # Check that the high latency loads are both scheduled first, before the diff --git a/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir b/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir index d8d4f5d0220c9..3091fe85fa8bc 100644 --- a/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir +++ b/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=postmisched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=postmisched -o - %s | FileCheck %s --- name: test_xnull_256 body: | diff --git a/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir b/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir index 0a47b87b422dd..1bee32f4c90cd 100644 --- a/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir +++ b/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple arm-arm-eabi -mcpu=cortex-m7 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck %s +# RUN: llc -mtriple arm-arm-eabi -mcpu=cortex-m7 -passes=postmisched %s -o - | FileCheck %s --- name: test_groups alignment: 2 diff --git a/llvm/test/CodeGen/ARM/misched-branch-targets.mir b/llvm/test/CodeGen/ARM/misched-branch-targets.mir index d828d9e516273..610344f844001 100644 --- a/llvm/test/CodeGen/ARM/misched-branch-targets.mir +++ b/llvm/test/CodeGen/ARM/misched-branch-targets.mir @@ -1,5 +1,7 @@ # RUN: llc -o - -run-pass=machine-scheduler -misched=shuffle %s | FileCheck %s +# RUN: llc -o - -passes=machine-scheduler -misched=shuffle %s | FileCheck %s # RUN: llc -o - -run-pass=postmisched %s | FileCheck %s +# RUN: llc -o - -passes=postmisched %s | FileCheck %s # REQUIRES: asserts # -misched=shuffle is only available with assertions enabled diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll new file mode 100644 index 0000000000000..421c8b67350c2 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/disable-opt-cs.ll @@ -0,0 +1,34 @@ +; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s + + +; CHECK: ; Combined Shader Flags for Module +; CHECK-NEXT: ; Shader Flags Value: 0x00000001 + +; CHECK: ; Note: extra DXIL module flags: +; CHECK-NEXT: ; D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION + +; CHECK: ; Shader Flags for Module Functions +; CHECK: ; Function main : 0x00000000 +; The test source in this file generated from the following command: +; clang -cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -O0 -o - <&1 %s | FileCheck %s + + +; CHECK: ; Combined Shader Flags for Module +; CHECK-NEXT: ; Shader Flags Value: 0x00000001 + +; CHECK: ; Note: extra DXIL module flags: +; CHECK-NEXT: ; 
D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION + +; CHECK: ; Shader Flags for Module Functions +; CHECK: ; Function main : 0x00000000 +; The test source in this file generated from the following command: +; clang -cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -O0 -o - <&1 %s | FileCheck %s + +target triple = "dxilv1.3-pc-shadermodel6.3-library" + +; All entry functions of a library shader need to either have optnone +; or not have the attribute +; CHECK: error: +; CHECK-SAME: in function entry_two +; CHECK-SAME: Inconsistent optnone attribute +; Function Attrs: convergent noinline norecurse optnone +define void @entry_one() #0 { +entry: + ret void +} + +; Function Attrs: convergent noinline norecurse +define void @entry_two() #1 { +entry: + ret void +} + +attributes #0 = { convergent noinline norecurse optnone "approx-func-fp-math"="true" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { convergent noinline norecurse "approx-func-fp-math"="true" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + +!llvm.module.flags = !{!0, !1} +!dx.valver = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 4, !"dx.disable_optimizations", i32 1} +!2 = !{i32 1, i32 8} diff --git a/llvm/test/CodeGen/DirectX/llc-pipeline.ll b/llvm/test/CodeGen/DirectX/llc-pipeline.ll index b071557249414..03b2150bbc1dc 100644 --- a/llvm/test/CodeGen/DirectX/llc-pipeline.ll +++ b/llvm/test/CodeGen/DirectX/llc-pipeline.ll @@ -23,8 +23,8 @@ ; CHECK-NEXT: Scalarize vector operations ; CHECK-NEXT: DXIL Resource Binding Analysis ; CHECK-NEXT: DXIL resource Information -; CHECK-NEXT: DXIL Shader Flag Analysis ; CHECK-NEXT: DXIL Module Metadata analysis +; CHECK-NEXT: DXIL Shader Flag Analysis ; CHECK-NEXT: DXIL Translate Metadata ; CHECK-NEXT: DXIL Op Lowering ; CHECK-NEXT: DXIL Prepare Module diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll index 23428b3728674..0aa66d1fc45f3 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll @@ -1,15 +1,15 @@ -; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32,G32 -; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64 -; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64 +; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32 +; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64 +; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64 ; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %} ; ALL-LABEL: conv1 define i32 @conv1(ptr addrspace(1) %ptr) { -; G32: cvta.global.u32 +; CLS32: cvta.global.u32 ; ALL-NOT: cvt.u64.u32 -; G64: cvta.global.u64 +; CLS64: cvta.global.u64 ; ALL: ld.u32 %genptr = addrspacecast ptr addrspace(1) %ptr to ptr %val = load i32, ptr %genptr @@ -99,6 +99,17 
@@ define i32 @conv8(ptr %ptr) { ret i32 %val } +; ALL-LABEL: conv9 +define i32 @conv9(ptr addrspace(1) %ptr) { +; CLS32: // implicit-def: %[[ADDR:r[0-9]+]] +; PTRCONV: // implicit-def: %[[ADDR:r[0-9]+]] +; NOPTRCONV: // implicit-def: %[[ADDR:rd[0-9]+]] +; ALL: ld.shared.u32 %r{{[0-9]+}}, [%[[ADDR]]] + %specptr = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3) + %val = load i32, ptr addrspace(3) %specptr + ret i32 %val +} + ; Check that we support addrspacecast when splitting the vector ; result (<2 x ptr> => 2 x <1 x ptr>). ; This also checks that scalarization works for addrspacecast diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll index 3bd534bb0cf5d..1f888d7fb21f1 100644 --- a/llvm/test/CodeGen/NVPTX/annotations.ll +++ b/llvm/test/CodeGen/NVPTX/annotations.ll @@ -23,20 +23,20 @@ define void @kernel_func_reqntid(ptr %a) { } ; CHECK: .entry kernel_func_minctasm -define void @kernel_func_minctasm(ptr %a) { +define ptx_kernel void @kernel_func_minctasm(ptr %a) "nvvm.minctasm"="42" { ; CHECK: .minnctapersm 42 ; CHECK: ret ret void } ; CHECK-LABEL: .entry kernel_func_maxnreg -define void @kernel_func_maxnreg() { +define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" { ; CHECK: .maxnreg 1234 ; CHECK: ret ret void } -!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10} +!nvvm.annotations = !{!1, !2, !3, !4, !9, !10} !1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1} !2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30} @@ -44,11 +44,5 @@ define void @kernel_func_maxnreg() { !3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1} !4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33} -!5 = !{ptr @kernel_func_minctasm, !"kernel", i32 1} -!6 = !{ptr @kernel_func_minctasm, !"minctasm", i32 42} - -!7 = !{ptr @kernel_func_maxnreg, !"kernel", i32 1} -!8 = !{ptr @kernel_func_maxnreg, !"maxnreg", i32 1234} - !9 = !{ptr addrspace(1) @texture, !"texture", i32 1} !10 = !{ptr addrspace(1) @surface, !"surface", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll index 60b3d70840af5..3b73c36de4b89 100644 --- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll +++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll @@ -43,7 +43,8 @@ define internal void @bar() { ret void } -; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"() { +; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init" +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_start, align 8 ; CHECK-NEXT: [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_end, align 8 @@ -60,7 +61,8 @@ define internal void @bar() { ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"() { +; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini" +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_start, align 8 ; CHECK-NEXT: [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_end, align 8 @@ -82,11 +84,11 @@ define internal void @bar() { ; CHECK: while.end: ; CHECK-NEXT: ret void +; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" } + ; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1} ; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", 
!"maxntidy", i32 1} ; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxclusterrank", i32 1} -; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1} -; CHECK: [[META7:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1} -; CHECK: [[META8:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1} -; CHECK: [[META9:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxclusterrank", i32 1} +; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1} +; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1} +; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll index c445c34c1842a..51483296dd34f 100644 --- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll +++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll @@ -10,16 +10,14 @@ target triple = "nvptx64-unknown-unknown" ; CHECK_SM_80-NOT: .maxclusterrank 8 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is -; sielently ignored. -define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() { +; silently ignored. +define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" { entry: %a = alloca i32, align 4 store volatile i32 1, ptr %a, align 4 ret void } -!nvvm.annotations = !{!1, !2, !3} +!nvvm.annotations = !{!1} !1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128} -!2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2} -!3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8} diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll index a9f370a12a945..3a1f59454493c 100644 --- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll +++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll @@ -1,28 +1,68 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 -; RUN: opt < %s -mtriple=nvptx64-unknown-unknown -O0 -S | FileCheck %s +; RUN: opt < %s -passes=verify -S | FileCheck %s -define i32 @foo(i32 %a, i32 %b) { -; CHECK-LABEL: define i32 @foo( +define i32 @test_align(i32 %a, i32 %b) { +; CHECK-LABEL: define i32 @test_align( ; CHECK-SAME: i32 alignstack(8) [[A:%.*]], i32 alignstack(16) [[B:%.*]]) { ; CHECK-NEXT: ret i32 0 ; ret i32 0 } -define i32 @bar(i32 %a, i32 %b) { -; CHECK-LABEL: define ptx_kernel i32 @bar( -; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: ret i32 0 +define void @test_kernel() { +; CHECK-LABEL: define ptx_kernel void @test_kernel() { +; CHECK-NEXT: ret void ; - ret i32 0 + ret void +} + +define void @test_maxclusterrank() { +; CHECK-LABEL: define void @test_maxclusterrank( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void } -!nvvm.annotations = !{!0, !1, !2} +define void @test_cluster_max_blocks() { +; CHECK-LABEL: define void @test_cluster_max_blocks( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} -!0 = !{ptr @foo, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010} +define void @test_minctasm() { +; CHECK-LABEL: define void @test_minctasm( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_maxnreg() { +; CHECK-LABEL: define void @test_maxnreg( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: ret void 
+; + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6} + +!0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010} !1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008} -!2 = !{ptr @bar, !"kernel", i32 1} +!2 = !{ptr @test_kernel, !"kernel", i32 1} +!3 = !{ptr @test_maxclusterrank, !"maxclusterrank", i32 2} +!4 = !{ptr @test_cluster_max_blocks, !"cluster_max_blocks", i32 3} +!5 = !{ptr @test_minctasm, !"minctasm", i32 4} +!6 = !{ptr @test_maxnreg, !"maxnreg", i32 5} ;. -; CHECK: [[META0:![0-9]+]] = !{ptr @foo, !"align", i32 8} +; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" } +; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" } +; CHECK: attributes #[[ATTR2]] = { "nvvm.minctasm"="4" } +; CHECK: attributes #[[ATTR3]] = { "nvvm.maxnreg"="5" } +;. +; CHECK: [[META0:![0-9]+]] = !{ptr @test_align, !"align", i32 8} ;. diff --git a/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir index 627e553475480..8bdbe288d98e6 100644 --- a/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir +++ b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=postmisched -o - %s | FileCheck %s +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -passes=postmisched -o - %s | FileCheck %s --- # Check that postmisched's TopDepthReduce heuristic moves the MULLD later # because of the dependency on x5 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll index 0fd23a7d346df..1b96189aaea5c 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll @@ -212,30 +212,30 @@ define i64 @add64_accept(i64 %a) nounwind { define void @add32_reject() nounwind { ; RV32I-LABEL: add32_reject: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, %hi(ga) -; RV32I-NEXT: lui a1, %hi(gb) -; RV32I-NEXT: lw a2, %lo(ga)(a0) -; RV32I-NEXT: lw a3, %lo(gb)(a1) -; RV32I-NEXT: lui a4, 1 -; RV32I-NEXT: addi a4, a4, -1096 -; RV32I-NEXT: add a2, a2, a4 -; RV32I-NEXT: add a3, a3, a4 -; RV32I-NEXT: sw a2, %lo(ga)(a0) -; RV32I-NEXT: sw a3, %lo(gb)(a1) +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: lui a1, %hi(ga) +; RV32I-NEXT: lui a2, %hi(gb) +; RV32I-NEXT: lw a3, %lo(ga)(a1) +; RV32I-NEXT: lw a4, %lo(gb)(a2) +; RV32I-NEXT: addi a0, a0, -1096 +; RV32I-NEXT: add a3, a3, a0 +; RV32I-NEXT: add a0, a4, a0 +; RV32I-NEXT: sw a3, %lo(ga)(a1) +; RV32I-NEXT: sw a0, %lo(gb)(a2) ; RV32I-NEXT: ret ; ; RV64I-LABEL: add32_reject: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, %hi(ga) -; RV64I-NEXT: lui a1, %hi(gb) -; RV64I-NEXT: lw a2, %lo(ga)(a0) -; RV64I-NEXT: lw a3, %lo(gb)(a1) -; RV64I-NEXT: lui a4, 1 -; RV64I-NEXT: addi a4, a4, -1096 -; RV64I-NEXT: add a2, a2, a4 -; RV64I-NEXT: add a3, a3, a4 -; RV64I-NEXT: sw a2, %lo(ga)(a0) -; RV64I-NEXT: sw a3, %lo(gb)(a1) +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: lui a1, %hi(ga) +; RV64I-NEXT: lui a2, %hi(gb) +; RV64I-NEXT: lw a3, %lo(ga)(a1) +; RV64I-NEXT: lw a4, %lo(gb)(a2) +; RV64I-NEXT: addi a0, a0, -1096 +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: add a0, a4, a0 +; RV64I-NEXT: sw a3, %lo(ga)(a1) +; RV64I-NEXT: sw a0, %lo(gb)(a2) ; RV64I-NEXT: ret %1 = load i32, ptr @ga, align 4 %2 = load i32, ptr @gb, align 4 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll 
b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll index 3a55189076dee..5b9f0e60e7d80 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll @@ -93,49 +93,49 @@ define i32 @expanded_neg_abs32_unsigned(i32 %x) { define i64 @expanded_neg_abs64(i64 %x) { ; RV32I-LABEL: expanded_neg_abs64: ; RV32I: # %bb.0: -; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: neg a3, a1 -; RV32I-NEXT: sub a2, a3, a2 -; RV32I-NEXT: neg a3, a0 -; RV32I-NEXT: beq a2, a1, .LBB2_2 +; RV32I-NEXT: neg a2, a0 +; RV32I-NEXT: snez a3, a0 +; RV32I-NEXT: neg a4, a1 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: beq a3, a1, .LBB2_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a4, a1, a2 +; RV32I-NEXT: slt a4, a1, a3 ; RV32I-NEXT: beqz a4, .LBB2_3 ; RV32I-NEXT: j .LBB2_4 ; RV32I-NEXT: .LBB2_2: -; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: bnez a4, .LBB2_4 ; RV32I-NEXT: .LBB2_3: -; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB2_4: -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a1, a3 -; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: neg a0, a2 +; RV32I-NEXT: snez a1, a2 +; RV32I-NEXT: neg a2, a3 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: expanded_neg_abs64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: snez a2, a0 -; RV32ZBB-NEXT: neg a3, a1 -; RV32ZBB-NEXT: sub a2, a3, a2 -; RV32ZBB-NEXT: neg a3, a0 -; RV32ZBB-NEXT: beq a2, a1, .LBB2_2 +; RV32ZBB-NEXT: neg a2, a0 +; RV32ZBB-NEXT: snez a3, a0 +; RV32ZBB-NEXT: neg a4, a1 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: beq a3, a1, .LBB2_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a4, a1, a2 +; RV32ZBB-NEXT: slt a4, a1, a3 ; RV32ZBB-NEXT: beqz a4, .LBB2_3 ; RV32ZBB-NEXT: j .LBB2_4 ; RV32ZBB-NEXT: .LBB2_2: -; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: bnez a4, .LBB2_4 ; RV32ZBB-NEXT: .LBB2_3: -; RV32ZBB-NEXT: mv a3, a0 -; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a2, a0 +; RV32ZBB-NEXT: mv a3, a1 ; RV32ZBB-NEXT: .LBB2_4: -; RV32ZBB-NEXT: neg a0, a3 -; RV32ZBB-NEXT: snez a1, a3 -; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: neg a0, a2 +; RV32ZBB-NEXT: snez a1, a2 +; RV32ZBB-NEXT: neg a2, a3 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: ret ; @@ -163,49 +163,49 @@ define i64 @expanded_neg_abs64(i64 %x) { define i64 @expanded_neg_abs64_unsigned(i64 %x) { ; RV32I-LABEL: expanded_neg_abs64_unsigned: ; RV32I: # %bb.0: -; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: neg a3, a1 -; RV32I-NEXT: sub a2, a3, a2 -; RV32I-NEXT: neg a3, a0 -; RV32I-NEXT: beq a2, a1, .LBB3_2 +; RV32I-NEXT: neg a2, a0 +; RV32I-NEXT: snez a3, a0 +; RV32I-NEXT: neg a4, a1 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: beq a3, a1, .LBB3_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a4, a1, a2 +; RV32I-NEXT: sltu a4, a1, a3 ; RV32I-NEXT: beqz a4, .LBB3_3 ; RV32I-NEXT: j .LBB3_4 ; RV32I-NEXT: .LBB3_2: -; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: bnez a4, .LBB3_4 ; RV32I-NEXT: .LBB3_3: -; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB3_4: -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a1, a3 -; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: neg a0, a2 +; RV32I-NEXT: snez a1, a2 +; RV32I-NEXT: neg a2, a3 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: expanded_neg_abs64_unsigned: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: snez a2, a0 -; RV32ZBB-NEXT: neg a3, a1 -; RV32ZBB-NEXT: sub a2, a3, a2 -; RV32ZBB-NEXT: neg a3, a0 -; 
RV32ZBB-NEXT: beq a2, a1, .LBB3_2 +; RV32ZBB-NEXT: neg a2, a0 +; RV32ZBB-NEXT: snez a3, a0 +; RV32ZBB-NEXT: neg a4, a1 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: beq a3, a1, .LBB3_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a4, a1, a2 +; RV32ZBB-NEXT: sltu a4, a1, a3 ; RV32ZBB-NEXT: beqz a4, .LBB3_3 ; RV32ZBB-NEXT: j .LBB3_4 ; RV32ZBB-NEXT: .LBB3_2: -; RV32ZBB-NEXT: sltu a4, a0, a3 +; RV32ZBB-NEXT: sltu a4, a0, a2 ; RV32ZBB-NEXT: bnez a4, .LBB3_4 ; RV32ZBB-NEXT: .LBB3_3: -; RV32ZBB-NEXT: mv a3, a0 -; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a2, a0 +; RV32ZBB-NEXT: mv a3, a1 ; RV32ZBB-NEXT: .LBB3_4: -; RV32ZBB-NEXT: neg a0, a3 -; RV32ZBB-NEXT: snez a1, a3 -; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: neg a0, a2 +; RV32ZBB-NEXT: snez a1, a2 +; RV32ZBB-NEXT: neg a2, a3 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: ret ; @@ -315,49 +315,49 @@ define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) { define i64 @expanded_neg_inv_abs64(i64 %x) { ; RV32I-LABEL: expanded_neg_inv_abs64: ; RV32I: # %bb.0: -; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: neg a3, a1 -; RV32I-NEXT: sub a2, a3, a2 -; RV32I-NEXT: neg a3, a0 -; RV32I-NEXT: beq a2, a1, .LBB6_2 +; RV32I-NEXT: neg a2, a0 +; RV32I-NEXT: snez a3, a0 +; RV32I-NEXT: neg a4, a1 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: beq a3, a1, .LBB6_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt a4, a2, a1 +; RV32I-NEXT: slt a4, a3, a1 ; RV32I-NEXT: beqz a4, .LBB6_3 ; RV32I-NEXT: j .LBB6_4 ; RV32I-NEXT: .LBB6_2: -; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: bnez a4, .LBB6_4 ; RV32I-NEXT: .LBB6_3: -; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB6_4: -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a1, a3 -; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: neg a0, a2 +; RV32I-NEXT: snez a1, a2 +; RV32I-NEXT: neg a2, a3 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: expanded_neg_inv_abs64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: snez a2, a0 -; RV32ZBB-NEXT: neg a3, a1 -; RV32ZBB-NEXT: sub a2, a3, a2 -; RV32ZBB-NEXT: neg a3, a0 -; RV32ZBB-NEXT: beq a2, a1, .LBB6_2 +; RV32ZBB-NEXT: neg a2, a0 +; RV32ZBB-NEXT: snez a3, a0 +; RV32ZBB-NEXT: neg a4, a1 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: beq a3, a1, .LBB6_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt a4, a2, a1 +; RV32ZBB-NEXT: slt a4, a3, a1 ; RV32ZBB-NEXT: beqz a4, .LBB6_3 ; RV32ZBB-NEXT: j .LBB6_4 ; RV32ZBB-NEXT: .LBB6_2: -; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: bnez a4, .LBB6_4 ; RV32ZBB-NEXT: .LBB6_3: -; RV32ZBB-NEXT: mv a3, a0 -; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a2, a0 +; RV32ZBB-NEXT: mv a3, a1 ; RV32ZBB-NEXT: .LBB6_4: -; RV32ZBB-NEXT: neg a0, a3 -; RV32ZBB-NEXT: snez a1, a3 -; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: neg a0, a2 +; RV32ZBB-NEXT: snez a1, a2 +; RV32ZBB-NEXT: neg a2, a3 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: ret ; @@ -385,49 +385,49 @@ define i64 @expanded_neg_inv_abs64(i64 %x) { define i64 @expanded_neg_inv_abs64_unsigned(i64 %x) { ; RV32I-LABEL: expanded_neg_inv_abs64_unsigned: ; RV32I: # %bb.0: -; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: neg a3, a1 -; RV32I-NEXT: sub a2, a3, a2 -; RV32I-NEXT: neg a3, a0 -; RV32I-NEXT: beq a2, a1, .LBB7_2 +; RV32I-NEXT: neg a2, a0 +; RV32I-NEXT: snez a3, a0 +; RV32I-NEXT: neg a4, a1 +; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: beq a3, a1, .LBB7_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu a4, a2, a1 +; RV32I-NEXT: sltu a4, a3, a1 ; RV32I-NEXT: beqz a4, .LBB7_3 ; RV32I-NEXT: j .LBB7_4 ; 
RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: sltu a4, a3, a0 +; RV32I-NEXT: sltu a4, a2, a0 ; RV32I-NEXT: bnez a4, .LBB7_4 ; RV32I-NEXT: .LBB7_3: -; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB7_4: -; RV32I-NEXT: neg a0, a3 -; RV32I-NEXT: snez a1, a3 -; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: neg a0, a2 +; RV32I-NEXT: snez a1, a2 +; RV32I-NEXT: neg a2, a3 ; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: expanded_neg_inv_abs64_unsigned: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: snez a2, a0 -; RV32ZBB-NEXT: neg a3, a1 -; RV32ZBB-NEXT: sub a2, a3, a2 -; RV32ZBB-NEXT: neg a3, a0 -; RV32ZBB-NEXT: beq a2, a1, .LBB7_2 +; RV32ZBB-NEXT: neg a2, a0 +; RV32ZBB-NEXT: snez a3, a0 +; RV32ZBB-NEXT: neg a4, a1 +; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: beq a3, a1, .LBB7_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: sltu a4, a3, a1 ; RV32ZBB-NEXT: beqz a4, .LBB7_3 ; RV32ZBB-NEXT: j .LBB7_4 ; RV32ZBB-NEXT: .LBB7_2: -; RV32ZBB-NEXT: sltu a4, a3, a0 +; RV32ZBB-NEXT: sltu a4, a2, a0 ; RV32ZBB-NEXT: bnez a4, .LBB7_4 ; RV32ZBB-NEXT: .LBB7_3: -; RV32ZBB-NEXT: mv a3, a0 -; RV32ZBB-NEXT: mv a2, a1 +; RV32ZBB-NEXT: mv a2, a0 +; RV32ZBB-NEXT: mv a3, a1 ; RV32ZBB-NEXT: .LBB7_4: -; RV32ZBB-NEXT: neg a0, a3 -; RV32ZBB-NEXT: snez a1, a3 -; RV32ZBB-NEXT: neg a2, a2 +; RV32ZBB-NEXT: neg a0, a2 +; RV32ZBB-NEXT: snez a1, a2 +; RV32ZBB-NEXT: neg a2, a3 ; RV32ZBB-NEXT: sub a1, a2, a1 ; RV32ZBB-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll index cb2037f5fb027..28dde9a3472c2 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-arith.ll @@ -424,11 +424,11 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv s2, a2 ; RV32I-NEXT: mv s3, a3 ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: lui a1, %hi(.LCPI12_0) -; RV32I-NEXT: addi a1, a1, %lo(.LCPI12_0) -; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: lui a2, %hi(.LCPI12_0) +; RV32I-NEXT: addi a3, a2, %lo(.LCPI12_0) +; RV32I-NEXT: lw a2, 0(a3) +; RV32I-NEXT: lw a3, 4(a3) ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv a4, a0 ; RV32I-NEXT: lui a5, 524288 @@ -454,9 +454,9 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI12_0) -; RV64I-NEXT: ld a1, %lo(.LCPI12_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: lui a1, %hi(.LCPI12_0) +; RV64I-NEXT: ld a1, %lo(.LCPI12_0)(a1) ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a1, a1, 63 @@ -511,20 +511,20 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a3 ; RV32I-NEXT: mv s2, a4 +; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: lui a2, %hi(.LCPI13_0) ; RV32I-NEXT: addi a2, a2, %lo(.LCPI13_0) -; RV32I-NEXT: lw s3, 0(a2) -; RV32I-NEXT: lw s4, 4(a2) -; RV32I-NEXT: mv s5, a5 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: lw s4, 0(a2) +; RV32I-NEXT: lw s5, 4(a2) +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: mv s7, a1 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s5 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; 
RV32I-NEXT: mv a4, a0 ; RV32I-NEXT: lui a5, 524288 @@ -556,14 +556,14 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI13_0) -; RV64I-NEXT: ld s1, %lo(.LCPI13_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: ld s2, %lo(.LCPI13_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a2, a1, 63 @@ -625,20 +625,20 @@ define double @fnmadd_d_2(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: mv s2, a4 +; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: lui a2, %hi(.LCPI14_0) ; RV32I-NEXT: addi a2, a2, %lo(.LCPI14_0) -; RV32I-NEXT: lw s3, 0(a2) -; RV32I-NEXT: lw s4, 4(a2) -; RV32I-NEXT: mv s5, a5 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: lw s4, 0(a2) +; RV32I-NEXT: lw s5, 4(a2) +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: mv s7, a1 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s5 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv a4, a0 ; RV32I-NEXT: lui a5, 524288 @@ -670,14 +670,14 @@ define double @fnmadd_d_2(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI14_0) -; RV64I-NEXT: ld s1, %lo(.LCPI14_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: ld s2, %lo(.LCPI14_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a2, a1, 63 @@ -799,11 +799,11 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a3 ; RV32I-NEXT: mv s2, a4 +; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: lui a2, %hi(.LCPI17_0) ; RV32I-NEXT: addi a3, a2, %lo(.LCPI17_0) ; RV32I-NEXT: lw a2, 0(a3) ; RV32I-NEXT: lw a3, 4(a3) -; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: lui a2, 524288 ; RV32I-NEXT: xor a1, a1, a2 @@ -827,9 +827,9 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI17_0) ; RV64I-NEXT: ld a1, %lo(.LCPI17_0)(a1) -; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a1, a1, 63 @@ -880,11 +880,11 @@ define double @fnmsub_d_2(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: mv s2, a4 +; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: lui a2, %hi(.LCPI18_0) ; RV32I-NEXT: addi a3, a2, %lo(.LCPI18_0) ; RV32I-NEXT: lw a2, 0(a3) ; RV32I-NEXT: lw a3, 4(a3) -; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: lui a3, 524288 @@ -910,9 +910,9 @@ define double @fnmsub_d_2(double %a, double %b, double %c) 
nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI18_0) ; RV64I-NEXT: ld a1, %lo(.LCPI18_0)(a1) -; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: li a1, -1 ; RV64I-NEXT: slli a1, a1, 63 @@ -1009,11 +1009,11 @@ define double @fmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv s2, a2 ; RV32I-NEXT: mv s3, a3 ; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: lui a1, %hi(.LCPI20_0) -; RV32I-NEXT: addi a1, a1, %lo(.LCPI20_0) -; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: lui a2, %hi(.LCPI20_0) +; RV32I-NEXT: addi a3, a2, %lo(.LCPI20_0) +; RV32I-NEXT: lw a2, 0(a3) +; RV32I-NEXT: lw a3, 4(a3) ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: mv s5, a1 @@ -1044,9 +1044,9 @@ define double @fmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI20_0) -; RV64I-NEXT: ld a1, %lo(.LCPI20_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: lui a1, %hi(.LCPI20_0) +; RV64I-NEXT: ld a1, %lo(.LCPI20_0)(a1) ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 @@ -1108,27 +1108,27 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a3 ; RV32I-NEXT: mv s2, a4 +; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: lui a2, %hi(.LCPI21_0) ; RV32I-NEXT: addi a2, a2, %lo(.LCPI21_0) -; RV32I-NEXT: lw s3, 0(a2) -; RV32I-NEXT: lw s4, 4(a2) -; RV32I-NEXT: mv s5, a5 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: lw s4, 0(a2) +; RV32I-NEXT: lw s5, 4(a2) +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: mv s7, a1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s5 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: mv s3, a1 @@ -1163,18 +1163,18 @@ define double @fnmadd_d_contract(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI21_0) -; RV64I-NEXT: ld s1, %lo(.LCPI21_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: ld s2, %lo(.LCPI21_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: mv a0, s3 @@ -1237,20 +1237,20 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a3 ; RV32I-NEXT: mv s2, a4 +; RV32I-NEXT: mv s3, a5 ; RV32I-NEXT: lui a2, %hi(.LCPI22_0) ; RV32I-NEXT: addi a2, a2, %lo(.LCPI22_0) -; RV32I-NEXT: lw s3, 0(a2) -; RV32I-NEXT: lw s4, 4(a2) -; RV32I-NEXT: mv s5, a5 -; RV32I-NEXT: mv 
a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: lw s4, 0(a2) +; RV32I-NEXT: lw s5, 4(a2) +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv s6, a0 ; RV32I-NEXT: mv s7, a1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: mv a2, s3 -; RV32I-NEXT: mv a3, s4 +; RV32I-NEXT: mv a2, s4 +; RV32I-NEXT: mv a3, s5 ; RV32I-NEXT: call __adddf3 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: mv a3, a1 @@ -1260,7 +1260,7 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __subdf3 ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload @@ -1283,20 +1283,20 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI22_0) -; RV64I-NEXT: ld s1, %lo(.LCPI22_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: ld s2, %lo(.LCPI22_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __adddf3 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __muldf3 ; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __subdf3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll index fdeda0c273f6d..676f0f5ec3eb8 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-arith.ll @@ -414,9 +414,9 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, %hi(.LCPI12_0) -; RV32I-NEXT: lw a1, %lo(.LCPI12_0)(a0) ; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: lui a1, %hi(.LCPI12_0) +; RV32I-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a2, 524288 ; RV32I-NEXT: xor a2, a0, a2 @@ -437,9 +437,9 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI12_0) -; RV64I-NEXT: lw a1, %lo(.LCPI12_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: lui a1, %hi(.LCPI12_0) +; RV64I-NEXT: lw a1, %lo(.LCPI12_0)(a1) ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a2, 524288 ; RV64I-NEXT: xor a2, a0, a2 @@ -475,14 +475,14 @@ define float @fnmadd_s(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: lui a1, %hi(.LCPI13_0) -; RV32I-NEXT: lw s1, %lo(.LCPI13_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: lw s2, %lo(.LCPI13_0)(a1) +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a2, 524288 ; RV32I-NEXT: xor a1, s3, a2 @@ -507,14 +507,14 @@ define float @fnmadd_s(float %a, 
float %b, float %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI13_0) -; RV64I-NEXT: lw s1, %lo(.LCPI13_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: lw s2, %lo(.LCPI13_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a2, 524288 ; RV64I-NEXT: xor a1, s3, a2 @@ -556,14 +556,14 @@ define float @fnmadd_s_2(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: lui a1, %hi(.LCPI14_0) -; RV32I-NEXT: lw s1, %lo(.LCPI14_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: lw s2, %lo(.LCPI14_0)(a1) +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s3, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a2, 524288 ; RV32I-NEXT: xor a1, s3, a2 @@ -588,14 +588,14 @@ define float @fnmadd_s_2(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI14_0) -; RV64I-NEXT: lw s1, %lo(.LCPI14_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: lw s2, %lo(.LCPI14_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s3, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a2, 524288 ; RV64I-NEXT: xor a1, s3, a2 @@ -720,9 +720,9 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: lui a1, %hi(.LCPI17_0) ; RV32I-NEXT: lw a1, %lo(.LCPI17_0)(a1) -; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a1, 524288 ; RV32I-NEXT: xor a0, a0, a1 @@ -742,9 +742,9 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI17_0) ; RV64I-NEXT: lw a1, %lo(.LCPI17_0)(a1) -; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a1, 524288 ; RV64I-NEXT: xor a0, a0, a1 @@ -778,9 +778,9 @@ define float @fnmsub_s_2(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: lui a1, %hi(.LCPI18_0) ; RV32I-NEXT: lw a1, %lo(.LCPI18_0)(a1) -; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: lui a1, 524288 ; RV32I-NEXT: xor a1, a0, a1 @@ -801,9 +801,9 @@ define float @fnmsub_s_2(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI18_0) ; RV64I-NEXT: lw a1, %lo(.LCPI18_0)(a1) -; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: lui a1, 524288 ; RV64I-NEXT: xor a1, a0, a1 @@ -877,9 +877,9 @@ define 
float @fmsub_s_contract(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: lui a0, %hi(.LCPI20_0) -; RV32I-NEXT: lw a1, %lo(.LCPI20_0)(a0) ; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: lui a1, %hi(.LCPI20_0) +; RV32I-NEXT: lw a1, %lo(.LCPI20_0)(a1) ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s2, a0 ; RV32I-NEXT: mv a0, s0 @@ -903,9 +903,9 @@ define float @fmsub_s_contract(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: lui a0, %hi(.LCPI20_0) -; RV64I-NEXT: lw a1, %lo(.LCPI20_0)(a0) ; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: lui a1, %hi(.LCPI20_0) +; RV64I-NEXT: lw a1, %lo(.LCPI20_0)(a1) ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s2, a0 ; RV64I-NEXT: mv a0, s0 @@ -946,18 +946,18 @@ define float @fnmadd_s_contract(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: lui a1, %hi(.LCPI21_0) -; RV32I-NEXT: lw s1, %lo(.LCPI21_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: lw s2, %lo(.LCPI21_0)(a1) +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: mv a0, s3 @@ -984,18 +984,18 @@ define float @fnmadd_s_contract(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI21_0) -; RV64I-NEXT: lw s1, %lo(.LCPI21_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: lw s2, %lo(.LCPI21_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: mv a0, s3 @@ -1039,20 +1039,20 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a2 ; RV32I-NEXT: lui a1, %hi(.LCPI22_0) -; RV32I-NEXT: lw s1, %lo(.LCPI22_0)(a1) -; RV32I-NEXT: mv s2, a2 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: lw s2, %lo(.LCPI22_0)(a1) +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call __mulsf3 ; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __subsf3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1071,20 +1071,20 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 +; 
RV64I-NEXT: mv s1, a2 ; RV64I-NEXT: lui a1, %hi(.LCPI22_0) -; RV64I-NEXT: lw s1, %lo(.LCPI22_0)(a1) -; RV64I-NEXT: mv s2, a2 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: lw s2, %lo(.LCPI22_0)(a1) +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: mv a1, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call __mulsf3 ; RV64I-NEXT: mv a1, a0 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __subsf3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll index 234f338412066..36ff827ebf32a 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll @@ -142,20 +142,20 @@ define i32 @freeze_anonstruct(ptr %p) { define i32 @freeze_anonstruct2(ptr %p) { ; RV32-LABEL: freeze_anonstruct2: ; RV32: # %bb.0: -; RV32-NEXT: lh a1, 4(a0) -; RV32-NEXT: lw a0, 0(a0) -; RV32-NEXT: slli a1, a1, 16 -; RV32-NEXT: srli a1, a1, 16 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lh a0, 4(a0) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: freeze_anonstruct2: ; RV64: # %bb.0: -; RV64-NEXT: lh a1, 4(a0) -; RV64-NEXT: lw a0, 0(a0) -; RV64-NEXT: slli a1, a1, 48 -; RV64-NEXT: srli a1, a1, 48 -; RV64-NEXT: addw a0, a0, a1 +; RV64-NEXT: lw a1, 0(a0) +; RV64-NEXT: lh a0, 4(a0) +; RV64-NEXT: slli a0, a0, 48 +; RV64-NEXT: srli a0, a0, 48 +; RV64-NEXT: addw a0, a1, a0 ; RV64-NEXT: ret %s = load {i32, i16}, ptr %p %y1 = freeze {i32, i16} %s @@ -169,20 +169,20 @@ define i32 @freeze_anonstruct2(ptr %p) { define i32 @freeze_anonstruct2_sext(ptr %p) { ; RV32-LABEL: freeze_anonstruct2_sext: ; RV32: # %bb.0: -; RV32-NEXT: lh a1, 4(a0) -; RV32-NEXT: lw a0, 0(a0) -; RV32-NEXT: slli a1, a1, 16 -; RV32-NEXT: srai a1, a1, 16 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lh a0, 4(a0) +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: srai a0, a0, 16 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: freeze_anonstruct2_sext: ; RV64: # %bb.0: -; RV64-NEXT: lh a1, 4(a0) -; RV64-NEXT: lw a0, 0(a0) -; RV64-NEXT: slli a1, a1, 48 -; RV64-NEXT: srai a1, a1, 48 -; RV64-NEXT: addw a0, a0, a1 +; RV64-NEXT: lw a1, 0(a0) +; RV64-NEXT: lh a0, 4(a0) +; RV64-NEXT: slli a0, a0, 48 +; RV64-NEXT: srai a0, a0, 48 +; RV64-NEXT: addw a0, a1, a0 ; RV64-NEXT: ret %s = load {i32, i16}, ptr %p %y1 = freeze {i32, i16} %s diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir index 42bf321228705..f8061462c6220 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir @@ -10,20 +10,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv1i8 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], 
[[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] ; RV32I-NEXT: PseudoRET implicit $v8 ; ; RV64I-LABEL: name: select_nxv1i8 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] ; RV64I-NEXT: PseudoRET implicit $v8 %0:vrb() = G_IMPLICIT_DEF @@ -41,20 +39,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv4i8 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] ; RV32I-NEXT: PseudoRET implicit $v8 ; ; RV64I-LABEL: name: select_nxv4i8 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] ; RV64I-NEXT: PseudoRET implicit $v8 %0:vrb() = G_IMPLICIT_DEF @@ -72,20 +68,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv16i8 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: select_nxv16i8 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 3 /* e8 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] ; RV64I-NEXT: PseudoRET implicit $v8m4 %0:vrb() = G_IMPLICIT_DEF @@ -103,20 +97,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv64i8 - ; RV32I: 
[[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] ; RV32I-NEXT: PseudoRET implicit $v8 ; ; RV64I-LABEL: name: select_nxv64i8 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] ; RV64I-NEXT: PseudoRET implicit $v8 %0:vrb() = G_IMPLICIT_DEF @@ -134,20 +126,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv2i16 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] ; RV32I-NEXT: PseudoRET implicit $v8 ; ; RV64I-LABEL: name: select_nxv2i16 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] ; RV64I-NEXT: PseudoRET implicit $v8 %0:vrb() = G_IMPLICIT_DEF @@ -165,20 +155,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv8i16 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] ; RV32I-NEXT: PseudoRET implicit $v8m4 ; ; RV64I-LABEL: name: select_nxv8i16 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm4 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm4nov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: 
[[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 4 /* e16 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M4_:%[0-9]+]]:vrm4nov0 = PseudoVMERGE_VVM_M4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8m4 = COPY [[PseudoVMERGE_VVM_M4_]] ; RV64I-NEXT: PseudoRET implicit $v8m4 %0:vrb() = G_IMPLICIT_DEF @@ -196,20 +184,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv32i16 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] ; RV32I-NEXT: PseudoRET implicit $v8 ; ; RV64I-LABEL: name: select_nxv32i16 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] ; RV64I-NEXT: PseudoRET implicit $v8 %0:vrb() = G_IMPLICIT_DEF @@ -227,20 +213,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv2i32 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] ; RV32I-NEXT: PseudoRET implicit $v8m2 ; ; RV64I-LABEL: name: select_nxv2i32 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] ; RV64I-NEXT: PseudoRET implicit $v8m2 %0:vrb() = G_IMPLICIT_DEF @@ -258,20 +242,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv8i32 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */ + ; RV32I-NEXT: 
[[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: select_nxv8i32 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 5 /* e32 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] ; RV64I-NEXT: PseudoRET implicit $v8m8 %0:vrb() = G_IMPLICIT_DEF @@ -289,20 +271,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv1i64 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV32I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] ; RV32I-NEXT: PseudoRET implicit $v8m2 ; ; RV64I-LABEL: name: select_nxv1i64 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm2 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm2nov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M2_:%[0-9]+]]:vrm2nov0 = PseudoVMERGE_VVM_M2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV64I-NEXT: $v8m2 = COPY [[PseudoVMERGE_VVM_M2_]] ; RV64I-NEXT: PseudoRET implicit $v8m2 %0:vrb() = G_IMPLICIT_DEF @@ -320,20 +300,18 @@ tracksRegLiveness: true body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv4i64 - ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF - ; RV32I-NEXT: $v0 = COPY [[DEF]] - ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */ + ; RV32I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV32I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] ; RV32I-NEXT: PseudoRET implicit $v8m8 ; ; RV64I-LABEL: name: select_nxv4i64 - ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF + ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrm8 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrm8nov0 = IMPLICIT_DEF - ; RV64I-NEXT: $v0 = COPY [[DEF]] - ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], $v0, -1, 6 /* e64 */ + ; RV64I-NEXT: [[PseudoVMERGE_VVM_M8_:%[0-9]+]]:vrm8nov0 = PseudoVMERGE_VVM_M8 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 6 /* e64 */ ; RV64I-NEXT: $v8m8 = COPY [[PseudoVMERGE_VVM_M8_]] ; RV64I-NEXT: PseudoRET implicit $v8m8 %0:vrb() = 
G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll index 8a786fc9993d2..6e13179bfe77e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll @@ -782,8 +782,8 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind { define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; RV32I-LABEL: rotl_64_mask: ; RV32I: # %bb.0: -; RV32I-NEXT: li a5, 32 ; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: li a5, 32 ; RV32I-NEXT: bltu a2, a5, .LBB10_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: li a3, 0 @@ -837,8 +837,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV32ZBB-LABEL: rotl_64_mask: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: li a5, 32 ; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: li a5, 32 ; RV32ZBB-NEXT: bltu a2, a5, .LBB10_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: li a3, 0 @@ -892,8 +892,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV32XTHEADBB-LABEL: rotl_64_mask: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: li a5, 32 ; RV32XTHEADBB-NEXT: neg a4, a2 +; RV32XTHEADBB-NEXT: li a5, 32 ; RV32XTHEADBB-NEXT: bltu a2, a5, .LBB10_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: li a3, 0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 9a6c718703a27..e1019c63408ee 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -68,8 +68,8 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: li s0, 31 +; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: beqz a1, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: srliw a1, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll index 558424b53be95..12afb3adf2f69 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll @@ -115,8 +115,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) { ; RV64I-LABEL: pack_i64_3: ; RV64I: # %bb.0: ; RV64I-NEXT: lwu a0, 0(a0) -; RV64I-NEXT: lwu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: lwu a1, 0(a1) ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll index 8b262db56ccd2..8bffb0772eeef 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll @@ -503,9 +503,9 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind { define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: shl128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 0(a2) ; RV32I-NEXT: lw a7, 0(a1) ; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a2, 0(a2) ; RV32I-NEXT: li a6, 64 ; RV32I-NEXT: li t1, 32 ; RV32I-NEXT: neg t5, a2 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll index caa749729ce19..11912483f8d9c 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/stacksave-stackrestore.ll @@ -17,10 +17,10 @@ define void @test_scoped_alloca(i64 %n) { ; RV32-NEXT: .cfi_offset s1, -12 ; RV32-NEXT: addi s0, sp, 16 ; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: mv s1, sp ; RV32-NEXT: addi a0, a0, 15 ; RV32-NEXT: andi a0, a0, -16 ; 
RV32-NEXT: sub a0, sp, a0 -; RV32-NEXT: mv s1, sp ; RV32-NEXT: mv sp, a0 ; RV32-NEXT: call use_addr ; RV32-NEXT: mv sp, s1 @@ -48,10 +48,10 @@ define void @test_scoped_alloca(i64 %n) { ; RV64-NEXT: .cfi_offset s1, -24 ; RV64-NEXT: addi s0, sp, 32 ; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: mv s1, sp ; RV64-NEXT: addi a0, a0, 15 ; RV64-NEXT: andi a0, a0, -16 ; RV64-NEXT: sub a0, sp, a0 -; RV64-NEXT: mv s1, sp ; RV64-NEXT: mv sp, a0 ; RV64-NEXT: call use_addr ; RV64-NEXT: mv sp, s1 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll index fc9be94988451..ba67b45ebbe7d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll @@ -49,12 +49,12 @@ define i32 @va1(ptr %fmt, ...) { ; RV32-NEXT: sw a2, 24(sp) ; RV32-NEXT: sw a3, 28(sp) ; RV32-NEXT: sw a4, 32(sp) -; RV32-NEXT: addi a0, sp, 20 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: lw a0, 12(sp) ; RV32-NEXT: sw a5, 36(sp) ; RV32-NEXT: sw a6, 40(sp) ; RV32-NEXT: sw a7, 44(sp) +; RV32-NEXT: addi a0, sp, 20 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: lw a0, 12(sp) ; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lw a0, 0(a0) @@ -103,12 +103,12 @@ define i32 @va1(ptr %fmt, ...) { ; RV32-WITHFP-NEXT: sw a2, 8(s0) ; RV32-WITHFP-NEXT: sw a3, 12(s0) ; RV32-WITHFP-NEXT: sw a4, 16(s0) -; RV32-WITHFP-NEXT: addi a0, s0, 4 -; RV32-WITHFP-NEXT: sw a0, -12(s0) -; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: sw a5, 20(s0) ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) +; RV32-WITHFP-NEXT: addi a0, s0, 4 +; RV32-WITHFP-NEXT: sw a0, -12(s0) +; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: addi a1, a0, 4 ; RV32-WITHFP-NEXT: sw a1, -12(s0) ; RV32-WITHFP-NEXT: lw a0, 0(a0) @@ -517,12 +517,12 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; ILP32-NEXT: sw a2, 24(sp) ; ILP32-NEXT: sw a3, 28(sp) ; ILP32-NEXT: sw a4, 32(sp) -; ILP32-NEXT: addi a0, sp, 20 -; ILP32-NEXT: sw a0, 12(sp) -; ILP32-NEXT: lw a0, 12(sp) ; ILP32-NEXT: sw a5, 36(sp) ; ILP32-NEXT: sw a6, 40(sp) ; ILP32-NEXT: sw a7, 44(sp) +; ILP32-NEXT: addi a0, sp, 20 +; ILP32-NEXT: sw a0, 12(sp) +; ILP32-NEXT: lw a0, 12(sp) ; ILP32-NEXT: addi a1, a0, 7 ; ILP32-NEXT: addi a0, a0, 15 ; ILP32-NEXT: andi a1, a1, -8 @@ -635,12 +635,12 @@ define i64 @va2(ptr %fmt, ...) nounwind { ; RV32-WITHFP-NEXT: sw a2, 8(s0) ; RV32-WITHFP-NEXT: sw a3, 12(s0) ; RV32-WITHFP-NEXT: sw a4, 16(s0) -; RV32-WITHFP-NEXT: addi a0, s0, 4 -; RV32-WITHFP-NEXT: sw a0, -12(s0) -; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: sw a5, 20(s0) ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) +; RV32-WITHFP-NEXT: addi a0, s0, 4 +; RV32-WITHFP-NEXT: sw a0, -12(s0) +; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: addi a1, a0, 7 ; RV32-WITHFP-NEXT: addi a0, a0, 15 ; RV32-WITHFP-NEXT: andi a1, a1, -8 @@ -854,14 +854,14 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32-LABEL: va3: ; ILP32: # %bb.0: ; ILP32-NEXT: addi sp, sp, -32 -; ILP32-NEXT: addi a0, sp, 12 -; ILP32-NEXT: sw a0, 4(sp) -; ILP32-NEXT: lw a0, 4(sp) ; ILP32-NEXT: sw a3, 12(sp) ; ILP32-NEXT: sw a4, 16(sp) ; ILP32-NEXT: sw a5, 20(sp) ; ILP32-NEXT: sw a6, 24(sp) ; ILP32-NEXT: sw a7, 28(sp) +; ILP32-NEXT: addi a0, sp, 12 +; ILP32-NEXT: sw a0, 4(sp) +; ILP32-NEXT: lw a0, 4(sp) ; ILP32-NEXT: addi a3, a0, 7 ; ILP32-NEXT: addi a0, a0, 15 ; ILP32-NEXT: andi a3, a3, -8 @@ -956,13 +956,13 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; RV64-LABEL: va3: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: sd a0, 8(sp) -; RV64-NEXT: ld a0, 8(sp) ; RV64-NEXT: sd a2, 16(sp) ; RV64-NEXT: sd a3, 24(sp) ; RV64-NEXT: sd a4, 32(sp) ; RV64-NEXT: sd a5, 40(sp) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: sd a0, 8(sp) +; RV64-NEXT: ld a0, 8(sp) ; RV64-NEXT: sd a6, 48(sp) ; RV64-NEXT: sd a7, 56(sp) ; RV64-NEXT: addi a2, a0, 7 @@ -980,14 +980,14 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; RV32-WITHFP-NEXT: sw ra, 20(sp) # 4-byte Folded Spill ; RV32-WITHFP-NEXT: sw s0, 16(sp) # 4-byte Folded Spill ; RV32-WITHFP-NEXT: addi s0, sp, 24 -; RV32-WITHFP-NEXT: addi a0, s0, 4 -; RV32-WITHFP-NEXT: sw a0, -12(s0) -; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: sw a3, 4(s0) ; RV32-WITHFP-NEXT: sw a4, 8(s0) ; RV32-WITHFP-NEXT: sw a5, 12(s0) ; RV32-WITHFP-NEXT: sw a6, 16(s0) ; RV32-WITHFP-NEXT: sw a7, 20(s0) +; RV32-WITHFP-NEXT: addi a0, s0, 4 +; RV32-WITHFP-NEXT: sw a0, -12(s0) +; RV32-WITHFP-NEXT: lw a0, -12(s0) ; RV32-WITHFP-NEXT: addi a3, a0, 7 ; RV32-WITHFP-NEXT: addi a0, a0, 15 ; RV32-WITHFP-NEXT: andi a3, a3, -8 @@ -1009,13 +1009,13 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; RV64-WITHFP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64-WITHFP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-WITHFP-NEXT: addi s0, sp, 32 -; RV64-WITHFP-NEXT: mv a0, s0 -; RV64-WITHFP-NEXT: sd a0, -24(s0) -; RV64-WITHFP-NEXT: ld a0, -24(s0) ; RV64-WITHFP-NEXT: sd a2, 0(s0) ; RV64-WITHFP-NEXT: sd a3, 8(s0) ; RV64-WITHFP-NEXT: sd a4, 16(s0) ; RV64-WITHFP-NEXT: sd a5, 24(s0) +; RV64-WITHFP-NEXT: mv a0, s0 +; RV64-WITHFP-NEXT: sd a0, -24(s0) +; RV64-WITHFP-NEXT: ld a0, -24(s0) ; RV64-WITHFP-NEXT: sd a6, 32(s0) ; RV64-WITHFP-NEXT: sd a7, 40(s0) ; RV64-WITHFP-NEXT: addi a2, a0, 7 @@ -1233,14 +1233,14 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV32-NEXT: addi a0, sp, 36 ; RV32-NEXT: sw a0, 16(sp) ; RV32-NEXT: lw a0, 16(sp) -; RV32-NEXT: addi a0, a0, 3 ; RV32-NEXT: li s0, -4 +; RV32-NEXT: addi a0, a0, 3 ; RV32-NEXT: and a0, a0, s0 ; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: lw a1, 16(sp) ; RV32-NEXT: lw s1, 0(a0) -; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: lw a0, 12(sp) ; RV32-NEXT: call notdead ; RV32-NEXT: lw a0, 16(sp) @@ -1254,8 +1254,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV32-NEXT: and a1, a1, s0 ; RV32-NEXT: addi a2, a1, 4 ; RV32-NEXT: sw a2, 16(sp) -; RV32-NEXT: lw a2, 16(sp) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 16(sp) ; RV32-NEXT: addi a2, a2, 3 ; RV32-NEXT: andi a2, a2, -4 ; RV32-NEXT: addi a3, a2, 4 @@ -1286,18 +1286,18 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV64-NEXT: addi a0, sp, 56 ; RV64-NEXT: sd a0, 16(sp) ; RV64-NEXT: ld a0, 16(sp) -; RV64-NEXT: addi a0, a0, 7 ; RV64-NEXT: li s0, -8 +; RV64-NEXT: addi a0, a0, 7 ; RV64-NEXT: and a0, a0, s0 ; RV64-NEXT: addi a1, a0, 8 ; RV64-NEXT: sd a1, 16(sp) ; RV64-NEXT: ld a1, 16(sp) ; RV64-NEXT: ld s1, 0(a0) ; RV64-NEXT: sd a1, 8(sp) -; RV64-NEXT: lw a0, 12(sp) -; RV64-NEXT: lwu a1, 8(sp) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: lwu a0, 8(sp) +; RV64-NEXT: lw a1, 12(sp) +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: call notdead ; RV64-NEXT: ld a0, 16(sp) ; RV64-NEXT: addi a0, a0, 7 @@ -1310,8 +1310,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) 
nounwind { ; RV64-NEXT: and a1, a1, s0 ; RV64-NEXT: addi a2, a1, 8 ; RV64-NEXT: sd a2, 16(sp) -; RV64-NEXT: ld a2, 16(sp) ; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: ld a2, 16(sp) ; RV64-NEXT: addi a2, a2, 7 ; RV64-NEXT: andi a2, a2, -8 ; RV64-NEXT: addi a3, a2, 8 @@ -1344,14 +1344,14 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV32-WITHFP-NEXT: addi a0, s0, 4 ; RV32-WITHFP-NEXT: sw a0, -20(s0) ; RV32-WITHFP-NEXT: lw a0, -20(s0) -; RV32-WITHFP-NEXT: addi a0, a0, 3 ; RV32-WITHFP-NEXT: li s1, -4 +; RV32-WITHFP-NEXT: addi a0, a0, 3 ; RV32-WITHFP-NEXT: and a0, a0, s1 ; RV32-WITHFP-NEXT: addi a1, a0, 4 ; RV32-WITHFP-NEXT: sw a1, -20(s0) -; RV32-WITHFP-NEXT: lw a1, -20(s0) ; RV32-WITHFP-NEXT: lw s2, 0(a0) -; RV32-WITHFP-NEXT: sw a1, -24(s0) +; RV32-WITHFP-NEXT: lw a0, -20(s0) +; RV32-WITHFP-NEXT: sw a0, -24(s0) ; RV32-WITHFP-NEXT: lw a0, -24(s0) ; RV32-WITHFP-NEXT: call notdead ; RV32-WITHFP-NEXT: lw a0, -20(s0) @@ -1365,8 +1365,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV32-WITHFP-NEXT: and a1, a1, s1 ; RV32-WITHFP-NEXT: addi a2, a1, 4 ; RV32-WITHFP-NEXT: sw a2, -20(s0) -; RV32-WITHFP-NEXT: lw a2, -20(s0) ; RV32-WITHFP-NEXT: lw a1, 0(a1) +; RV32-WITHFP-NEXT: lw a2, -20(s0) ; RV32-WITHFP-NEXT: addi a2, a2, 3 ; RV32-WITHFP-NEXT: andi a2, a2, -4 ; RV32-WITHFP-NEXT: addi a3, a2, 4 @@ -1400,18 +1400,18 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV64-WITHFP-NEXT: addi a0, s0, 8 ; RV64-WITHFP-NEXT: sd a0, -40(s0) ; RV64-WITHFP-NEXT: ld a0, -40(s0) -; RV64-WITHFP-NEXT: addi a0, a0, 7 ; RV64-WITHFP-NEXT: li s1, -8 +; RV64-WITHFP-NEXT: addi a0, a0, 7 ; RV64-WITHFP-NEXT: and a0, a0, s1 ; RV64-WITHFP-NEXT: addi a1, a0, 8 ; RV64-WITHFP-NEXT: sd a1, -40(s0) ; RV64-WITHFP-NEXT: ld a1, -40(s0) ; RV64-WITHFP-NEXT: ld s2, 0(a0) ; RV64-WITHFP-NEXT: sd a1, -48(s0) -; RV64-WITHFP-NEXT: lw a0, -44(s0) -; RV64-WITHFP-NEXT: lwu a1, -48(s0) -; RV64-WITHFP-NEXT: slli a0, a0, 32 -; RV64-WITHFP-NEXT: or a0, a0, a1 +; RV64-WITHFP-NEXT: lwu a0, -48(s0) +; RV64-WITHFP-NEXT: lw a1, -44(s0) +; RV64-WITHFP-NEXT: slli a1, a1, 32 +; RV64-WITHFP-NEXT: or a0, a1, a0 ; RV64-WITHFP-NEXT: call notdead ; RV64-WITHFP-NEXT: ld a0, -40(s0) ; RV64-WITHFP-NEXT: addi a0, a0, 7 @@ -1424,8 +1424,8 @@ define iXLen @va4_va_copy(i32 %argno, ...) nounwind { ; RV64-WITHFP-NEXT: and a1, a1, s1 ; RV64-WITHFP-NEXT: addi a2, a1, 8 ; RV64-WITHFP-NEXT: sd a2, -40(s0) -; RV64-WITHFP-NEXT: ld a2, -40(s0) ; RV64-WITHFP-NEXT: ld a1, 0(a1) +; RV64-WITHFP-NEXT: ld a2, -40(s0) ; RV64-WITHFP-NEXT: addi a2, a2, 7 ; RV64-WITHFP-NEXT: andi a2, a2, -8 ; RV64-WITHFP-NEXT: addi a3, a2, 8 @@ -1593,19 +1593,19 @@ define i32 @va_large_stack(ptr %fmt, ...) { ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: sw a4, 288(a0) ; RV32-NEXT: lui a0, 24414 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: sw a5, 292(a0) +; RV32-NEXT: lui a0, 24414 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: sw a6, 296(a0) +; RV32-NEXT: lui a0, 24414 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: sw a7, 300(a0) +; RV32-NEXT: lui a0, 24414 ; RV32-NEXT: addi a0, a0, 276 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: lw a0, 12(sp) -; RV32-NEXT: lui a1, 24414 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: sw a5, 292(a1) -; RV32-NEXT: lui a1, 24414 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: sw a6, 296(a1) -; RV32-NEXT: lui a1, 24414 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: sw a7, 300(a1) ; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lw a0, 0(a0) @@ -1682,12 +1682,12 @@ define i32 @va_large_stack(ptr %fmt, ...) 
{ ; RV32-WITHFP-NEXT: sw a2, 8(s0) ; RV32-WITHFP-NEXT: sw a3, 12(s0) ; RV32-WITHFP-NEXT: sw a4, 16(s0) -; RV32-WITHFP-NEXT: addi a1, s0, 4 -; RV32-WITHFP-NEXT: sw a1, 0(a0) -; RV32-WITHFP-NEXT: lw a1, 0(a0) ; RV32-WITHFP-NEXT: sw a5, 20(s0) ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) +; RV32-WITHFP-NEXT: addi a1, s0, 4 +; RV32-WITHFP-NEXT: sw a1, 0(a0) +; RV32-WITHFP-NEXT: lw a1, 0(a0) ; RV32-WITHFP-NEXT: addi a2, a1, 4 ; RV32-WITHFP-NEXT: sw a2, 0(a0) ; RV32-WITHFP-NEXT: lw a0, 0(a1) @@ -1869,12 +1869,12 @@ define i32 @va_printf(ptr %fmt, ...) { ; RV32-NEXT: sw a2, 24(sp) ; RV32-NEXT: sw a3, 28(sp) ; RV32-NEXT: sw a4, 32(sp) -; RV32-NEXT: addi a1, sp, 20 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: lw a1, 8(sp) ; RV32-NEXT: sw a5, 36(sp) ; RV32-NEXT: sw a6, 40(sp) ; RV32-NEXT: sw a7, 44(sp) +; RV32-NEXT: addi a1, sp, 20 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lw a1, 8(sp) ; RV32-NEXT: call va_vprintf ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra @@ -1892,12 +1892,12 @@ define i32 @va_printf(ptr %fmt, ...) { ; RV64-NEXT: sd a2, 32(sp) ; RV64-NEXT: sd a3, 40(sp) ; RV64-NEXT: sd a4, 48(sp) -; RV64-NEXT: addi a1, sp, 24 -; RV64-NEXT: sd a1, 0(sp) -; RV64-NEXT: ld a1, 0(sp) ; RV64-NEXT: sd a5, 56(sp) ; RV64-NEXT: sd a6, 64(sp) ; RV64-NEXT: sd a7, 72(sp) +; RV64-NEXT: addi a1, sp, 24 +; RV64-NEXT: sd a1, 0(sp) +; RV64-NEXT: ld a1, 0(sp) ; RV64-NEXT: call va_vprintf ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: .cfi_restore ra @@ -1919,12 +1919,12 @@ define i32 @va_printf(ptr %fmt, ...) { ; RV32-WITHFP-NEXT: sw a2, 8(s0) ; RV32-WITHFP-NEXT: sw a3, 12(s0) ; RV32-WITHFP-NEXT: sw a4, 16(s0) -; RV32-WITHFP-NEXT: addi a1, s0, 4 -; RV32-WITHFP-NEXT: sw a1, -12(s0) -; RV32-WITHFP-NEXT: lw a1, -12(s0) ; RV32-WITHFP-NEXT: sw a5, 20(s0) ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) +; RV32-WITHFP-NEXT: addi a1, s0, 4 +; RV32-WITHFP-NEXT: sw a1, -12(s0) +; RV32-WITHFP-NEXT: lw a1, -12(s0) ; RV32-WITHFP-NEXT: call va_vprintf ; RV32-WITHFP-NEXT: .cfi_def_cfa sp, 48 ; RV32-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -1949,12 +1949,12 @@ define i32 @va_printf(ptr %fmt, ...) 
{ ; RV64-WITHFP-NEXT: sd a2, 16(s0) ; RV64-WITHFP-NEXT: sd a3, 24(s0) ; RV64-WITHFP-NEXT: sd a4, 32(s0) -; RV64-WITHFP-NEXT: addi a1, s0, 8 -; RV64-WITHFP-NEXT: sd a1, -24(s0) -; RV64-WITHFP-NEXT: ld a1, -24(s0) ; RV64-WITHFP-NEXT: sd a5, 40(s0) ; RV64-WITHFP-NEXT: sd a6, 48(s0) ; RV64-WITHFP-NEXT: sd a7, 56(s0) +; RV64-WITHFP-NEXT: addi a1, s0, 8 +; RV64-WITHFP-NEXT: sd a1, -24(s0) +; RV64-WITHFP-NEXT: ld a1, -24(s0) ; RV64-WITHFP-NEXT: call va_vprintf ; RV64-WITHFP-NEXT: .cfi_def_cfa sp, 96 ; RV64-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll index bc002fee4417c..47c17d615e0f2 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -5,22 +5,22 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: slli a0, a0, 16 ; RV64I-NEXT: slli a1, a1, 16 ; RV64I-NEXT: or a0, a0, a3 @@ -40,22 +40,22 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or a0, a0, a3 @@ -82,22 +82,22 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: 
lbu a1, 3(a1) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: slli a0, a0, 16 ; RV64I-NEXT: slli a1, a1, 16 ; RV64I-NEXT: or a0, a0, a3 @@ -117,22 +117,22 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or a0, a0, a3 @@ -159,22 +159,22 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lbu a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: lbu a4, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: lbu a4, 0(a1) +; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: slli a0, a0, 16 ; RV64I-NEXT: slli a1, a1, 16 ; RV64I-NEXT: or a0, a0, a3 @@ -194,22 +194,22 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a7 
+; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or a0, a0, a3 @@ -247,38 +247,38 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a0, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu t2, 2(a1) -; RV64I-NEXT: lbu t3, 3(a1) -; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu t0, 5(a1) -; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t3, t3, 8 -; RV64I-NEXT: or t2, t3, t2 -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lui a4, 16 ; RV64I-NEXT: addi a4, a4, -1 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a5, t2, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -310,54 +310,54 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu a0, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a6, a0, t0 +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: lbu a7, 1(a1) ; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or t1, a0, t1 ; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, a7, a0 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: slli a0, a4, 16 +; RV32I-NEXT: slli a0, a5, 16 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a3, t1, 16 -; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: or a1, a6, a4 +; RV32I-NEXT: or a3, a3, a7 +; RV32I-NEXT: slli a3, a3, 3 ; RV32I-NEXT: li a4, 32 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: bltu a1, a4, .LBB3_2 +; RV32I-NEXT: bltu a3, a4, .LBB3_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srl a5, a3, 
a1 -; RV32I-NEXT: bnez a1, .LBB3_3 +; RV32I-NEXT: srl a5, a1, a3 +; RV32I-NEXT: bnez a3, .LBB3_3 ; RV32I-NEXT: j .LBB3_4 ; RV32I-NEXT: .LBB3_2: -; RV32I-NEXT: srl a5, a0, a1 -; RV32I-NEXT: neg a6, a1 -; RV32I-NEXT: sll a6, a3, a6 +; RV32I-NEXT: srl a5, a0, a3 +; RV32I-NEXT: neg a6, a3 +; RV32I-NEXT: sll a6, a1, a6 ; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: beqz a1, .LBB3_4 +; RV32I-NEXT: beqz a3, .LBB3_4 ; RV32I-NEXT: .LBB3_3: ; RV32I-NEXT: mv a0, a5 ; RV32I-NEXT: .LBB3_4: -; RV32I-NEXT: bltu a1, a4, .LBB3_6 +; RV32I-NEXT: bltu a3, a4, .LBB3_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: j .LBB3_7 ; RV32I-NEXT: .LBB3_6: -; RV32I-NEXT: srl a1, a3, a1 +; RV32I-NEXT: srl a1, a1, a3 ; RV32I-NEXT: .LBB3_7: ; RV32I-NEXT: srli a3, a0, 16 ; RV32I-NEXT: lui a4, 16 @@ -398,38 +398,38 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a0, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu t2, 2(a1) -; RV64I-NEXT: lbu t3, 3(a1) -; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu t0, 5(a1) -; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t3, t3, 8 -; RV64I-NEXT: or t2, t3, t2 -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lui a4, 16 ; RV64I-NEXT: addi a4, a4, -1 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a5, t2, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -461,34 +461,34 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu a0, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, a7, a4 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: lbu a4, 0(a1) ; RV32I-NEXT: lbu a7, 1(a1) ; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, t1 ; RV32I-NEXT: 
slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, a7, a4 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a4, a5, 16 ; RV32I-NEXT: or a4, a4, a3 ; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or a3, a1, a6 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a3, a1, a7 ; RV32I-NEXT: slli a3, a3, 3 ; RV32I-NEXT: li a1, 32 -; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: bltu a3, a1, .LBB4_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: li a1, 0 @@ -544,38 +544,38 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a0, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu t2, 2(a1) -; RV64I-NEXT: lbu t3, 3(a1) -; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a0, a0, t1 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 4(a1) -; RV64I-NEXT: lbu t0, 5(a1) -; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 5(a1) +; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli t3, t3, 8 -; RV64I-NEXT: or t2, t3, t2 -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lui a4, 16 ; RV64I-NEXT: addi a4, a4, -1 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a5, t2, a5 -; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -607,54 +607,54 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu a0, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a6, a0, t0 +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: lbu a7, 1(a1) ; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or t1, a0, t1 ; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, a7, a0 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: slli a0, a4, 16 +; RV32I-NEXT: slli a0, a5, 16 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a3, t1, 16 
-; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: or a1, a6, a4 +; RV32I-NEXT: or a3, a3, a7 +; RV32I-NEXT: slli a3, a3, 3 ; RV32I-NEXT: li a4, 32 -; RV32I-NEXT: or a3, a3, a5 -; RV32I-NEXT: bltu a1, a4, .LBB5_2 +; RV32I-NEXT: bltu a3, a4, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sra a5, a3, a1 -; RV32I-NEXT: bnez a1, .LBB5_3 +; RV32I-NEXT: sra a5, a1, a3 +; RV32I-NEXT: bnez a3, .LBB5_3 ; RV32I-NEXT: j .LBB5_4 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: srl a5, a0, a1 -; RV32I-NEXT: neg a6, a1 -; RV32I-NEXT: sll a6, a3, a6 +; RV32I-NEXT: srl a5, a0, a3 +; RV32I-NEXT: neg a6, a3 +; RV32I-NEXT: sll a6, a1, a6 ; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: beqz a1, .LBB5_4 +; RV32I-NEXT: beqz a3, .LBB5_4 ; RV32I-NEXT: .LBB5_3: ; RV32I-NEXT: mv a0, a5 ; RV32I-NEXT: .LBB5_4: -; RV32I-NEXT: bltu a1, a4, .LBB5_6 +; RV32I-NEXT: bltu a3, a4, .LBB5_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: srai a1, a3, 31 +; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: j .LBB5_7 ; RV32I-NEXT: .LBB5_6: -; RV32I-NEXT: sra a1, a3, a1 +; RV32I-NEXT: sra a1, a1, a3 ; RV32I-NEXT: .LBB5_7: ; RV32I-NEXT: srli a3, a0, 16 ; RV32I-NEXT: lui a4, 16 @@ -686,8 +686,6 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -702,81 +700,81 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or t1, t4, t3 -; RV64I-NEXT: or t2, t6, t5 -; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or t2, t4, t3 +; RV64I-NEXT: or t3, t6, t5 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t4, 1(a1) ; RV64I-NEXT: lbu t5, 2(a1) ; RV64I-NEXT: lbu t6, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: or a0, a0, s0 -; RV64I-NEXT: or a6, t4, t3 -; RV64I-NEXT: lbu t3, 4(a1) -; RV64I-NEXT: lbu t4, 5(a1) -; RV64I-NEXT: lbu s0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a7, t4, a7 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t4, 4(a1) +; RV64I-NEXT: lbu t5, 5(a1) +; RV64I-NEXT: lbu t6, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: or t4, t5, t4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or a1, a1, t6 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: slli 
t2, t2, 16 -; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a5, t3, t2 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, t4 ; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a0, 32 -; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: slli t0, a1, 32 ; RV64I-NEXT: or a0, a4, a3 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a3, t0, a6 +; RV64I-NEXT: slli a3, a3, 3 ; RV64I-NEXT: li a4, 64 -; RV64I-NEXT: or a3, a6, a7 -; RV64I-NEXT: bltu a1, a4, .LBB6_2 +; RV64I-NEXT: bltu a3, a4, .LBB6_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: subw a5, a1, a4 -; RV64I-NEXT: srl a5, a3, a5 -; RV64I-NEXT: bnez a1, .LBB6_3 +; RV64I-NEXT: subw a5, a3, a4 +; RV64I-NEXT: srl a5, a1, a5 +; RV64I-NEXT: bnez a3, .LBB6_3 ; RV64I-NEXT: j .LBB6_4 ; RV64I-NEXT: .LBB6_2: -; RV64I-NEXT: srl a5, a0, a1 -; RV64I-NEXT: negw a6, a1 -; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: srl a5, a0, a3 +; RV64I-NEXT: negw a6, a3 +; RV64I-NEXT: sll a6, a1, a6 ; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: beqz a1, .LBB6_4 +; RV64I-NEXT: beqz a3, .LBB6_4 ; RV64I-NEXT: .LBB6_3: ; RV64I-NEXT: mv a0, a5 ; RV64I-NEXT: .LBB6_4: -; RV64I-NEXT: bltu a1, a4, .LBB6_6 +; RV64I-NEXT: bltu a3, a4, .LBB6_6 ; RV64I-NEXT: # %bb.5: ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: j .LBB6_7 ; RV64I-NEXT: .LBB6_6: -; RV64I-NEXT: srl a1, a3, a1 +; RV64I-NEXT: srl a1, a1, a3 ; RV64I-NEXT: .LBB6_7: ; RV64I-NEXT: srli a3, a0, 32 ; RV64I-NEXT: srliw a4, a0, 16 @@ -814,8 +812,6 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a3, 13(a2) ; RV64I-NEXT: sb t4, 14(a2) ; RV64I-NEXT: sb t5, 15(a2) -; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_16bytes: @@ -833,42 +829,42 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t4, 8(a0) -; RV32I-NEXT: lbu t5, 9(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli s0, t3, 8 ; RV32I-NEXT: or t3, a7, a6 ; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: lbu a6, 12(a0) -; RV32I-NEXT: lbu a7, 13(a0) -; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t2, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t6, a7, a6 -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or t2, t2, a7 +; RV32I-NEXT: or t4, a0, t4 +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: lbu s0, 2(a1) +; RV32I-NEXT: lbu t6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or t2, a0, t2 ; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or s1, a7, a6 +; RV32I-NEXT: or s0, a7, a0 ; 
RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: or t6, a1, t6 ; RV32I-NEXT: li a7, 32 ; RV32I-NEXT: slli a1, a5, 8 ; RV32I-NEXT: slli a0, t0, 8 ; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: or a6, t5, t4 -; RV32I-NEXT: or t0, t2, t6 -; RV32I-NEXT: or a5, s0, s1 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or a6, t5, a6 +; RV32I-NEXT: or t0, t4, t2 +; RV32I-NEXT: or a5, t6, s0 ; RV32I-NEXT: slli a5, a5, 3 ; RV32I-NEXT: srl t2, a6, a5 ; RV32I-NEXT: neg t5, a5 @@ -1019,8 +1015,6 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_16bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1035,81 +1029,81 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or t1, t4, t3 -; RV64I-NEXT: or t2, t6, t5 -; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or t2, t4, t3 +; RV64I-NEXT: or t3, t6, t5 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t4, 1(a1) ; RV64I-NEXT: lbu t5, 2(a1) ; RV64I-NEXT: lbu t6, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: or a0, a0, s0 -; RV64I-NEXT: or a6, t4, t3 -; RV64I-NEXT: lbu t3, 4(a1) -; RV64I-NEXT: lbu t4, 5(a1) -; RV64I-NEXT: lbu s0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a7, t4, a7 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t4, 4(a1) +; RV64I-NEXT: lbu t5, 5(a1) +; RV64I-NEXT: lbu t6, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: or t4, t5, t4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or a1, a1, t6 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a5, t3, t2 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, t4 ; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a0, 32 -; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: slli t0, a1, 32 ; RV64I-NEXT: or a0, a4, 
a3 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a3, t0, a6 +; RV64I-NEXT: slli a3, a3, 5 ; RV64I-NEXT: li a4, 64 -; RV64I-NEXT: or a3, a6, a7 -; RV64I-NEXT: bltu a1, a4, .LBB7_2 +; RV64I-NEXT: bltu a3, a4, .LBB7_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: subw a5, a1, a4 -; RV64I-NEXT: srl a5, a3, a5 -; RV64I-NEXT: bnez a1, .LBB7_3 +; RV64I-NEXT: subw a5, a3, a4 +; RV64I-NEXT: srl a5, a1, a5 +; RV64I-NEXT: bnez a3, .LBB7_3 ; RV64I-NEXT: j .LBB7_4 ; RV64I-NEXT: .LBB7_2: -; RV64I-NEXT: srl a5, a0, a1 -; RV64I-NEXT: negw a6, a1 -; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: srl a5, a0, a3 +; RV64I-NEXT: negw a6, a3 +; RV64I-NEXT: sll a6, a1, a6 ; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: beqz a1, .LBB7_4 +; RV64I-NEXT: beqz a3, .LBB7_4 ; RV64I-NEXT: .LBB7_3: ; RV64I-NEXT: mv a0, a5 ; RV64I-NEXT: .LBB7_4: -; RV64I-NEXT: bltu a1, a4, .LBB7_6 +; RV64I-NEXT: bltu a3, a4, .LBB7_6 ; RV64I-NEXT: # %bb.5: ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: j .LBB7_7 ; RV64I-NEXT: .LBB7_6: -; RV64I-NEXT: srl a1, a3, a1 +; RV64I-NEXT: srl a1, a1, a3 ; RV64I-NEXT: .LBB7_7: ; RV64I-NEXT: srli a3, a0, 32 ; RV64I-NEXT: srliw a4, a0, 16 @@ -1147,8 +1141,6 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb a3, 13(a2) ; RV64I-NEXT: sb t4, 14(a2) ; RV64I-NEXT: sb t5, 15(a2) -; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_16bytes_wordOff: @@ -1166,42 +1158,42 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: lbu t4, 8(a0) -; RV32I-NEXT: lbu t5, 9(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli s0, t3, 8 ; RV32I-NEXT: or t3, a7, a6 ; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: lbu a6, 12(a0) -; RV32I-NEXT: lbu a7, 13(a0) -; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t2, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t6, a7, a6 -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or t2, t2, a7 +; RV32I-NEXT: or t4, a0, t4 +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: lbu s0, 2(a1) +; RV32I-NEXT: lbu t6, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or t2, a0, t2 ; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or s1, a7, a6 +; RV32I-NEXT: or s0, a7, a0 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: or t6, a1, t6 ; RV32I-NEXT: li a7, 32 ; RV32I-NEXT: slli a1, a5, 8 ; RV32I-NEXT: slli a0, t0, 8 ; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: or a6, t5, t4 -; RV32I-NEXT: or t0, t2, t6 -; RV32I-NEXT: or a5, s0, s1 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or a6, t5, a6 +; RV32I-NEXT: or t0, t4, t2 +; RV32I-NEXT: or a5, t6, s0 ; RV32I-NEXT: slli a5, a5, 5 ; RV32I-NEXT: srl t2, a6, a5 ; RV32I-NEXT: neg t5, a5 @@ -1352,8 +1344,6 @@ define void 
@lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1368,60 +1358,60 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or t1, t4, t3 -; RV64I-NEXT: or t2, t6, t5 -; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or t2, t4, t3 +; RV64I-NEXT: or t3, t6, t5 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t4, 1(a1) ; RV64I-NEXT: lbu t5, 2(a1) ; RV64I-NEXT: lbu t6, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: or a0, a0, s0 -; RV64I-NEXT: or a6, t4, t3 -; RV64I-NEXT: lbu t3, 4(a1) -; RV64I-NEXT: lbu t4, 5(a1) -; RV64I-NEXT: lbu s0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a7, t4, a7 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t4, 4(a1) +; RV64I-NEXT: lbu t5, 5(a1) +; RV64I-NEXT: lbu t6, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: or t4, t5, t4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or a1, a1, t6 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a5, t3, t2 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, t4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a4, a4, a3 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a3, a1, 3 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a3, a1, a6 +; RV64I-NEXT: slli a3, a3, 3 ; RV64I-NEXT: li a5, 64 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: bltu a3, a5, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: li a1, 0 @@ -1475,8 +1465,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a3, 13(a2) ; RV64I-NEXT: sb t4, 14(a2) ; RV64I-NEXT: sb t5, 15(a2) -; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_16bytes: @@ -1485,34 +1473,34 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) 
nounwind { ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) ; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a7, t1, t0 ; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: lbu t0, 1(a1) ; RV32I-NEXT: lbu t1, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, a6 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t2, t0, a6 ; RV32I-NEXT: li a6, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: li t1, 32 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli t2, t0, 16 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or t0, a4, a3 -; RV32I-NEXT: or a4, t2, a5 -; RV32I-NEXT: or a5, a1, a7 +; RV32I-NEXT: or t0, a5, a3 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a5, a1, t2 ; RV32I-NEXT: slli a5, a5, 3 ; RV32I-NEXT: neg t3, a5 ; RV32I-NEXT: srl t4, t0, t3 @@ -1533,8 +1521,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s0, 11(a0) ; RV32I-NEXT: lbu s1, 15(a0) ; RV32I-NEXT: sub a7, a6, a5 ; RV32I-NEXT: mv a3, a4 @@ -1542,11 +1529,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: # %bb.4: ; RV32I-NEXT: mv a3, t5 ; RV32I-NEXT: .LBB8_5: -; RV32I-NEXT: lbu s2, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu s0, 13(a0) -; RV32I-NEXT: lbu t6, 14(a0) -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s2, 10(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu t5, 14(a0) ; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: bltu a7, t1, .LBB8_7 ; RV32I-NEXT: # %bb.6: @@ -1557,20 +1544,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll s4, a4, s4 ; RV32I-NEXT: or s4, t4, s4 ; RV32I-NEXT: .LBB8_8: -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: lbu s6, 8(a0) -; RV32I-NEXT: lbu s5, 12(a0) -; RV32I-NEXT: or s3, s3, t5 -; RV32I-NEXT: slli t5, s0, 8 -; RV32I-NEXT: or s1, s1, t6 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s0, s0, s2 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or s1, s1, t5 ; RV32I-NEXT: mv t4, t0 ; RV32I-NEXT: beqz a7, .LBB8_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv t4, s4 ; RV32I-NEXT: .LBB8_10: -; RV32I-NEXT: or a0, s2, s6 -; RV32I-NEXT: slli s0, s3, 16 -; RV32I-NEXT: or t6, t5, s5 +; RV32I-NEXT: or a0, s3, s5 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: or t6, t6, s2 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: bltu a7, t1, .LBB8_12 ; RV32I-NEXT: # %bb.11: @@ -1619,7 +1606,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lw s3, 16(sp) # 4-byte 
Folded Reload ; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: bltu a5, a6, .LBB8_24 ; RV32I-NEXT: # %bb.23: @@ -1681,8 +1667,6 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1697,60 +1681,60 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or t1, t4, t3 -; RV64I-NEXT: or t2, t6, t5 -; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or t2, t4, t3 +; RV64I-NEXT: or t3, t6, t5 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t4, 1(a1) ; RV64I-NEXT: lbu t5, 2(a1) ; RV64I-NEXT: lbu t6, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: or a0, a0, s0 -; RV64I-NEXT: or a6, t4, t3 -; RV64I-NEXT: lbu t3, 4(a1) -; RV64I-NEXT: lbu t4, 5(a1) -; RV64I-NEXT: lbu s0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a7, t4, a7 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t4, 4(a1) +; RV64I-NEXT: lbu t5, 5(a1) +; RV64I-NEXT: lbu t6, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: or t4, t5, t4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or a1, a1, t6 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a5, t3, t2 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, t4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a4, a4, a3 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a3, a1, 5 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a3, a1, a6 +; RV64I-NEXT: slli a3, a3, 5 ; RV64I-NEXT: li a5, 64 -; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: bltu a3, a5, .LBB9_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: li a1, 0 @@ -1804,8 +1788,6 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; 
RV64I-NEXT: sb a3, 13(a2) ; RV64I-NEXT: sb t4, 14(a2) ; RV64I-NEXT: sb t5, 15(a2) -; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_16bytes_wordOff: @@ -1814,34 +1796,34 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) ; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a7, t1, t0 ; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: lbu t0, 1(a1) ; RV32I-NEXT: lbu t1, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, a6 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t2, t0, a6 ; RV32I-NEXT: li a6, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: li t1, 32 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli t2, t0, 16 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 16 -; RV32I-NEXT: or t0, a4, a3 -; RV32I-NEXT: or a4, t2, a5 -; RV32I-NEXT: or a5, a1, a7 +; RV32I-NEXT: or t0, a5, a3 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a5, a1, t2 ; RV32I-NEXT: slli a5, a5, 5 ; RV32I-NEXT: neg t3, a5 ; RV32I-NEXT: srl t4, t0, t3 @@ -1862,8 +1844,7 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s0, 11(a0) ; RV32I-NEXT: lbu s1, 15(a0) ; RV32I-NEXT: sub a7, a6, a5 ; RV32I-NEXT: mv a3, a4 @@ -1871,11 +1852,11 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: # %bb.4: ; RV32I-NEXT: mv a3, t5 ; RV32I-NEXT: .LBB9_5: -; RV32I-NEXT: lbu s2, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu s0, 13(a0) -; RV32I-NEXT: lbu t6, 14(a0) -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s2, 10(a0) +; RV32I-NEXT: lbu t6, 13(a0) +; RV32I-NEXT: lbu t5, 14(a0) ; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: bltu a7, t1, .LBB9_7 ; RV32I-NEXT: # %bb.6: @@ -1886,20 +1867,20 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sll s4, a4, s4 ; RV32I-NEXT: or s4, t4, s4 ; RV32I-NEXT: .LBB9_8: -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: lbu s6, 8(a0) -; RV32I-NEXT: lbu s5, 12(a0) -; RV32I-NEXT: or s3, s3, t5 -; RV32I-NEXT: slli t5, s0, 8 -; RV32I-NEXT: or s1, s1, t6 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s0, s0, s2 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s2, 12(a0) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: or s1, s1, t5 ; RV32I-NEXT: mv t4, t0 ; RV32I-NEXT: beqz a7, .LBB9_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv t4, s4 ; RV32I-NEXT: .LBB9_10: -; RV32I-NEXT: or a0, s2, s6 -; RV32I-NEXT: slli s0, s3, 16 -; RV32I-NEXT: or t6, t5, 
s5 +; RV32I-NEXT: or a0, s3, s5 +; RV32I-NEXT: slli s0, s0, 16 +; RV32I-NEXT: or t6, t6, s2 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: bltu a7, t1, .LBB9_12 ; RV32I-NEXT: # %bb.11: @@ -1948,7 +1929,6 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: lw s3, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s4, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s5, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 4(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: bltu a5, a6, .LBB9_24 ; RV32I-NEXT: # %bb.23: @@ -2011,8 +1991,6 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -2027,81 +2005,81 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or t1, t4, t3 -; RV64I-NEXT: or t2, t6, t5 -; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or t2, t4, t3 +; RV64I-NEXT: or t3, t6, t5 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t4, 1(a1) ; RV64I-NEXT: lbu t5, 2(a1) ; RV64I-NEXT: lbu t6, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: or a0, a0, s0 -; RV64I-NEXT: or a6, t4, t3 -; RV64I-NEXT: lbu t3, 4(a1) -; RV64I-NEXT: lbu t4, 5(a1) -; RV64I-NEXT: lbu s0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a7, t4, a7 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t4, 4(a1) +; RV64I-NEXT: lbu t5, 5(a1) +; RV64I-NEXT: lbu t6, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: or t4, t5, t4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or a1, a1, t6 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a5, t3, t2 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, t4 ; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a0, 32 -; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: slli t0, a1, 32 ; RV64I-NEXT: or a0, a4, a3 -; 
RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a3, t0, a6 +; RV64I-NEXT: slli a3, a3, 3 ; RV64I-NEXT: li a4, 64 -; RV64I-NEXT: or a3, a6, a7 -; RV64I-NEXT: bltu a1, a4, .LBB10_2 +; RV64I-NEXT: bltu a3, a4, .LBB10_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: subw a5, a1, a4 -; RV64I-NEXT: sra a5, a3, a5 -; RV64I-NEXT: bnez a1, .LBB10_3 +; RV64I-NEXT: subw a5, a3, a4 +; RV64I-NEXT: sra a5, a1, a5 +; RV64I-NEXT: bnez a3, .LBB10_3 ; RV64I-NEXT: j .LBB10_4 ; RV64I-NEXT: .LBB10_2: -; RV64I-NEXT: srl a5, a0, a1 -; RV64I-NEXT: negw a6, a1 -; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: srl a5, a0, a3 +; RV64I-NEXT: negw a6, a3 +; RV64I-NEXT: sll a6, a1, a6 ; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: beqz a1, .LBB10_4 +; RV64I-NEXT: beqz a3, .LBB10_4 ; RV64I-NEXT: .LBB10_3: ; RV64I-NEXT: mv a0, a5 ; RV64I-NEXT: .LBB10_4: -; RV64I-NEXT: bltu a1, a4, .LBB10_6 +; RV64I-NEXT: bltu a3, a4, .LBB10_6 ; RV64I-NEXT: # %bb.5: -; RV64I-NEXT: srai a1, a3, 63 +; RV64I-NEXT: srai a1, a1, 63 ; RV64I-NEXT: j .LBB10_7 ; RV64I-NEXT: .LBB10_6: -; RV64I-NEXT: sra a1, a3, a1 +; RV64I-NEXT: sra a1, a1, a3 ; RV64I-NEXT: .LBB10_7: ; RV64I-NEXT: srli a3, a0, 32 ; RV64I-NEXT: srliw a4, a0, 16 @@ -2139,8 +2117,6 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a3, 13(a2) ; RV64I-NEXT: sb t4, 14(a2) ; RV64I-NEXT: sb t5, 15(a2) -; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_16bytes: @@ -2158,42 +2134,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t0, 6(a0) ; RV32I-NEXT: lbu t1, 7(a0) ; RV32I-NEXT: lbu t4, 8(a0) -; RV32I-NEXT: lbu t5, 9(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli s0, t3, 8 ; RV32I-NEXT: or t3, a7, a6 ; RV32I-NEXT: or t1, t1, t0 -; RV32I-NEXT: lbu a6, 12(a0) -; RV32I-NEXT: lbu a7, 13(a0) -; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t6, 1(a1) -; RV32I-NEXT: lbu s0, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or s1, a0, t0 ; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, a7 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or t6, t0, a7 +; RV32I-NEXT: or a7, a0, t4 +; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: lbu t0, 1(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or s0, t0, a0 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: or t4, a1, t4 ; RV32I-NEXT: li t0, 32 ; RV32I-NEXT: slli a1, a5, 8 ; RV32I-NEXT: slli a0, t2, 8 ; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: or a7, t5, t4 -; RV32I-NEXT: or a5, s1, a6 -; RV32I-NEXT: or a6, s0, t6 +; RV32I-NEXT: slli a5, a7, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: or a7, t5, a6 +; RV32I-NEXT: or a5, a5, t6 +; RV32I-NEXT: or a6, t4, s0 ; RV32I-NEXT: slli a6, a6, 3 ; 
RV32I-NEXT: srl t2, a7, a6 ; RV32I-NEXT: neg t6, a6 @@ -2344,8 +2320,6 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -2360,81 +2334,81 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: lbu t6, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a6, 13(a0) -; RV64I-NEXT: lbu s0, 14(a0) +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or t1, t4, t3 -; RV64I-NEXT: or t2, t6, t5 -; RV64I-NEXT: lbu t3, 0(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: or t2, t4, t3 +; RV64I-NEXT: or t3, t6, t5 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t4, 1(a1) ; RV64I-NEXT: lbu t5, 2(a1) ; RV64I-NEXT: lbu t6, 3(a1) -; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: or a0, a0, s0 -; RV64I-NEXT: or a6, t4, t3 -; RV64I-NEXT: lbu t3, 4(a1) -; RV64I-NEXT: lbu t4, 5(a1) -; RV64I-NEXT: lbu s0, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: slli t4, t4, 8 -; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a7, t4, a7 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t4, 4(a1) +; RV64I-NEXT: lbu t5, 5(a1) +; RV64I-NEXT: lbu t6, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t5, t5, 8 +; RV64I-NEXT: or t4, t5, t4 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or a1, a1, t6 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: or a4, t1, a5 +; RV64I-NEXT: slli t3, t3, 16 +; RV64I-NEXT: or a5, t3, t2 ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli t5, t5, 16 -; RV64I-NEXT: or a5, t5, a6 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: slli a1, a1, 16 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a1, a1, t4 ; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a0, 32 -; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: slli t0, a1, 32 ; RV64I-NEXT: or a0, a4, a3 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: or a1, a7, a5 +; RV64I-NEXT: or a3, t0, a6 +; RV64I-NEXT: slli a3, a3, 5 ; RV64I-NEXT: li a4, 64 -; RV64I-NEXT: or a3, a6, a7 -; RV64I-NEXT: bltu a1, a4, .LBB11_2 +; RV64I-NEXT: bltu a3, a4, .LBB11_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: subw a5, a1, a4 -; RV64I-NEXT: sra a5, a3, a5 -; RV64I-NEXT: bnez a1, .LBB11_3 +; RV64I-NEXT: subw a5, a3, a4 +; RV64I-NEXT: sra a5, a1, a5 +; RV64I-NEXT: bnez a3, .LBB11_3 ; RV64I-NEXT: j .LBB11_4 ; RV64I-NEXT: 
.LBB11_2: -; RV64I-NEXT: srl a5, a0, a1 -; RV64I-NEXT: negw a6, a1 -; RV64I-NEXT: sll a6, a3, a6 +; RV64I-NEXT: srl a5, a0, a3 +; RV64I-NEXT: negw a6, a3 +; RV64I-NEXT: sll a6, a1, a6 ; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: beqz a1, .LBB11_4 +; RV64I-NEXT: beqz a3, .LBB11_4 ; RV64I-NEXT: .LBB11_3: ; RV64I-NEXT: mv a0, a5 ; RV64I-NEXT: .LBB11_4: -; RV64I-NEXT: bltu a1, a4, .LBB11_6 +; RV64I-NEXT: bltu a3, a4, .LBB11_6 ; RV64I-NEXT: # %bb.5: -; RV64I-NEXT: srai a1, a3, 63 +; RV64I-NEXT: srai a1, a1, 63 ; RV64I-NEXT: j .LBB11_7 ; RV64I-NEXT: .LBB11_6: -; RV64I-NEXT: sra a1, a3, a1 +; RV64I-NEXT: sra a1, a1, a3 ; RV64I-NEXT: .LBB11_7: ; RV64I-NEXT: srli a3, a0, 32 ; RV64I-NEXT: srliw a4, a0, 16 @@ -2472,8 +2446,6 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb a3, 13(a2) ; RV64I-NEXT: sb t4, 14(a2) ; RV64I-NEXT: sb t5, 15(a2) -; RV64I-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_16bytes_wordOff: @@ -2491,42 +2463,42 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: lbu t0, 6(a0) ; RV32I-NEXT: lbu t1, 7(a0) ; RV32I-NEXT: lbu t4, 8(a0) -; RV32I-NEXT: lbu t5, 9(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t3, a7, a6 -; RV32I-NEXT: or t1, t1, t0 -; RV32I-NEXT: lbu a6, 12(a0) -; RV32I-NEXT: lbu a7, 13(a0) -; RV32I-NEXT: lbu t0, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t6, 1(a1) -; RV32I-NEXT: lbu s0, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or s1, a0, t0 +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli s0, t3, 8 +; RV32I-NEXT: or t3, a7, a6 +; RV32I-NEXT: or t1, t1, t0 +; RV32I-NEXT: or a6, s0, t4 +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 13(a0) +; RV32I-NEXT: lbu t4, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) ; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, a7 +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: or t5, t6, t5 +; RV32I-NEXT: or t6, t0, a7 +; RV32I-NEXT: or a7, a0, t4 +; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: lbu t0, 1(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or s0, t0, a0 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or s0, a1, s0 +; RV32I-NEXT: or t4, a1, t4 ; RV32I-NEXT: li t0, 32 ; RV32I-NEXT: slli a1, a5, 8 ; RV32I-NEXT: slli a0, t2, 8 ; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: or a7, t5, t4 -; RV32I-NEXT: or a5, s1, a6 -; RV32I-NEXT: or a6, s0, t6 +; RV32I-NEXT: slli a5, a7, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: or a7, t5, a6 +; RV32I-NEXT: or a5, a5, t6 +; RV32I-NEXT: or a6, t4, s0 ; RV32I-NEXT: slli a6, a6, 5 ; RV32I-NEXT: srl t2, a7, a6 ; RV32I-NEXT: neg t6, a6 @@ -2713,88 +2685,88 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: or a7, t0, a7 +; 
RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 ; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli s3, s3, 8 ; RV64I-NEXT: or a4, t4, t3 ; RV64I-NEXT: or a6, t6, t5 -; RV64I-NEXT: or t0, s1, s0 -; RV64I-NEXT: lbu t5, 24(a0) -; RV64I-NEXT: lbu t6, 25(a0) -; RV64I-NEXT: lbu s0, 26(a0) -; RV64I-NEXT: lbu s1, 27(a0) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: or t4, s3, s2 -; RV64I-NEXT: or t2, s5, s4 -; RV64I-NEXT: or t3, s7, s6 -; RV64I-NEXT: lbu s2, 28(a0) -; RV64I-NEXT: lbu s3, 29(a0) -; RV64I-NEXT: lbu s4, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 ; RV64I-NEXT: lbu t6, 0(a1) -; RV64I-NEXT: lbu s1, 1(a1) -; RV64I-NEXT: lbu s7, 2(a1) -; RV64I-NEXT: lbu s8, 3(a1) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, a0, s4 -; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or s5, a0, s5 +; RV64I-NEXT: or t6, s3, t6 +; RV64I-NEXT: or s3, s7, s4 ; RV64I-NEXT: lbu a0, 4(a1) -; RV64I-NEXT: lbu s1, 5(a1) -; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s7, s8, s7 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, a0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or s4, a1, s4 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a7 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: or a0, t4, t0 -; RV64I-NEXT: slli t3, t3, 16 -; RV64I-NEXT: or t0, t3, t2 +; RV64I-NEXT: or s7, a1, s7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a1, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a0, t2, t1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s0, s1, s0 ; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: or t1, s6, s5 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: or t3, s0, t5 +; RV64I-NEXT: or t0, s6, s2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: or t3, s5, t5 ; RV64I-NEXT: slli s3, s3, 16 -; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: or t5, s3, t6 ; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: or t6, s7, t6 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: or t6, s7, s4 ; RV64I-NEXT: li a7, 64 ; RV64I-NEXT: slli t4, a5, 16 ; RV64I-NEXT: slli t2, a6, 16 -; RV64I-NEXT: slli t1, t1, 32 -; RV64I-NEXT: slli t5, 
t5, 32 -; RV64I-NEXT: slli s0, s0, 32 -; RV64I-NEXT: or a6, t1, t0 -; RV64I-NEXT: or t0, t5, t3 -; RV64I-NEXT: or a5, s0, t6 +; RV64I-NEXT: slli t0, t0, 32 +; RV64I-NEXT: slli t3, t3, 32 +; RV64I-NEXT: slli t6, t6, 32 +; RV64I-NEXT: or a6, t0, s0 +; RV64I-NEXT: or t0, t3, t1 +; RV64I-NEXT: or a5, t6, t5 ; RV64I-NEXT: slli a5, a5, 3 ; RV64I-NEXT: subw t1, a5, a7 ; RV64I-NEXT: negw t5, a5 @@ -3008,49 +2980,49 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t0, 21(a0) ; RV32I-NEXT: lbu t1, 22(a0) ; RV32I-NEXT: lbu t2, 23(a0) -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t4, 25(a0) -; RV32I-NEXT: lbu t5, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 24(a0) +; RV32I-NEXT: lbu t3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a6, a6, a5 ; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a4, t2, t1 -; RV32I-NEXT: lbu a7, 28(a0) -; RV32I-NEXT: lbu t0, 29(a0) -; RV32I-NEXT: lbu t1, 30(a0) -; RV32I-NEXT: lbu t2, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t4, t4, t3 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: lbu t0, 28(a0) +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: lbu t3, 31(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: lbu t2, 0(a1) ; RV32I-NEXT: lbu t3, 1(a1) -; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: or t5, a1, t5 ; RV32I-NEXT: li t3, 32 ; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a1, a4, 16 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli a4, t1, 16 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: or t1, t5, t4 -; RV32I-NEXT: or t5, a4, a7 -; RV32I-NEXT: or a4, t2, t0 +; RV32I-NEXT: slli a1, a7, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a7, t1, 16 +; RV32I-NEXT: slli t6, t5, 16 +; RV32I-NEXT: or t1, t4, a4 +; RV32I-NEXT: or t5, a7, t0 +; RV32I-NEXT: or a4, t6, t2 ; RV32I-NEXT: slli a4, a4, 3 -; RV32I-NEXT: srl s0, t1, a4 +; RV32I-NEXT: srl s1, t1, a4 ; RV32I-NEXT: neg s6, a4 ; RV32I-NEXT: sll t4, t5, s6 ; RV32I-NEXT: bltu a4, t3, .LBB12_2 @@ -3058,7 +3030,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srl a7, t5, a4 ; RV32I-NEXT: j .LBB12_3 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: or a7, s0, t4 +; RV32I-NEXT: or a7, s1, t4 ; RV32I-NEXT: .LBB12_3: ; RV32I-NEXT: or t0, a6, a3 ; RV32I-NEXT: or a6, a1, a5 @@ -3072,11 +3044,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t3, .LBB12_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: li s0, 0 ; RV32I-NEXT: srl a3, a6, a4 ; RV32I-NEXT: j .LBB12_8 ; RV32I-NEXT: .LBB12_7: -; RV32I-NEXT: srl ra, t5, a4 +; RV32I-NEXT: srl s0, t5, a4 ; RV32I-NEXT: or 
a3, a3, a5 ; RV32I-NEXT: .LBB12_8: ; RV32I-NEXT: li t6, 64 @@ -3113,29 +3085,29 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: mv t4, a3 ; RV32I-NEXT: .LBB12_18: ; RV32I-NEXT: neg s11, s9 -; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t3, .LBB12_20 ; RV32I-NEXT: # %bb.19: -; RV32I-NEXT: srl s2, t5, s9 +; RV32I-NEXT: srl s1, t5, s9 ; RV32I-NEXT: j .LBB12_21 ; RV32I-NEXT: .LBB12_20: ; RV32I-NEXT: sll a3, t5, s11 -; RV32I-NEXT: or s2, s0, a3 +; RV32I-NEXT: or s1, s1, a3 ; RV32I-NEXT: .LBB12_21: -; RV32I-NEXT: lbu s1, 11(a0) +; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s2, 11(a0) ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: mv s0, t1 ; RV32I-NEXT: beqz s9, .LBB12_23 ; RV32I-NEXT: # %bb.22: -; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: mv s0, s1 ; RV32I-NEXT: .LBB12_23: -; RV32I-NEXT: lbu s4, 9(a0) -; RV32I-NEXT: lbu s2, 10(a0) -; RV32I-NEXT: lbu s5, 13(a0) -; RV32I-NEXT: lbu s8, 14(a0) -; RV32I-NEXT: slli s3, s1, 8 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s4, 10(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu ra, 14(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t3, .LBB12_25 ; RV32I-NEXT: # %bb.24: ; RV32I-NEXT: li s1, 0 @@ -3143,12 +3115,12 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB12_25: ; RV32I-NEXT: srl s1, t5, a4 ; RV32I-NEXT: .LBB12_26: -; RV32I-NEXT: or s2, s3, s2 -; RV32I-NEXT: lbu ra, 8(a0) -; RV32I-NEXT: lbu s3, 12(a0) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: or s8, a3, s8 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s2, s2, s4 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s4, 12(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or ra, a3, ra ; RV32I-NEXT: bgeu a4, t6, .LBB12_28 ; RV32I-NEXT: # %bb.27: ; RV32I-NEXT: or s0, a7, t2 @@ -3156,10 +3128,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB12_28: ; RV32I-NEXT: lbu a3, 3(a0) ; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: or a5, s4, ra +; RV32I-NEXT: or a5, s3, s5 ; RV32I-NEXT: slli t4, s2, 16 -; RV32I-NEXT: or s2, s5, s3 -; RV32I-NEXT: slli s3, s8, 16 +; RV32I-NEXT: or s2, s8, s4 +; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: mv s4, t0 ; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a4, .LBB12_30 @@ -3167,25 +3139,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: mv s4, s0 ; RV32I-NEXT: mv a7, s1 ; RV32I-NEXT: .LBB12_30: -; RV32I-NEXT: slli s5, a3, 8 -; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: slli s3, a3, 8 +; RV32I-NEXT: lbu s8, 1(a0) ; RV32I-NEXT: lbu a3, 2(a0) ; RV32I-NEXT: lbu s1, 5(a0) ; RV32I-NEXT: lbu s0, 6(a0) -; RV32I-NEXT: slli s8, t2, 8 +; RV32I-NEXT: slli s5, t2, 8 ; RV32I-NEXT: or t4, t4, a5 -; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t2, ra, s2 ; RV32I-NEXT: bltu a4, t6, .LBB12_32 ; RV32I-NEXT: # %bb.31: ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB12_32: -; RV32I-NEXT: slli s3, ra, 8 -; RV32I-NEXT: or a5, s5, a3 -; RV32I-NEXT: lbu s5, 0(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a5, s3, a3 +; RV32I-NEXT: lbu s3, 0(a0) ; RV32I-NEXT: lbu a0, 4(a0) ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or a3, s8, s0 +; RV32I-NEXT: or a3, s5, s0 ; RV32I-NEXT: srl s2, t4, a4 ; RV32I-NEXT: sll ra, t2, s6 ; RV32I-NEXT: bltu a4, t3, .LBB12_34 @@ 
-3195,7 +3167,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB12_34: ; RV32I-NEXT: or s0, s2, ra ; RV32I-NEXT: .LBB12_35: -; RV32I-NEXT: or s3, s3, s5 +; RV32I-NEXT: or s3, s8, s3 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: slli a3, a3, 16 @@ -3637,88 +3609,88 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 ; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli s3, s3, 8 ; RV64I-NEXT: or a4, t4, t3 ; RV64I-NEXT: or a6, t6, t5 -; RV64I-NEXT: or t0, s1, s0 -; RV64I-NEXT: lbu t5, 24(a0) -; RV64I-NEXT: lbu t6, 25(a0) -; RV64I-NEXT: lbu s0, 26(a0) -; RV64I-NEXT: lbu s1, 27(a0) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: or t4, s3, s2 -; RV64I-NEXT: or t2, s5, s4 -; RV64I-NEXT: or t3, s7, s6 -; RV64I-NEXT: lbu s2, 28(a0) -; RV64I-NEXT: lbu s3, 29(a0) -; RV64I-NEXT: lbu s4, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 ; RV64I-NEXT: lbu t6, 0(a1) -; RV64I-NEXT: lbu s1, 1(a1) -; RV64I-NEXT: lbu s7, 2(a1) -; RV64I-NEXT: lbu s8, 3(a1) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, a0, s4 -; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or s5, a0, s5 +; RV64I-NEXT: or t6, s3, t6 +; RV64I-NEXT: or s3, s7, s4 ; RV64I-NEXT: lbu a0, 4(a1) -; RV64I-NEXT: lbu s1, 5(a1) -; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s7, s8, s7 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, a0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or s4, a1, s4 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a7 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: or a0, t4, t0 -; RV64I-NEXT: slli t3, t3, 16 -; RV64I-NEXT: or t0, t3, t2 +; RV64I-NEXT: or s7, a1, s7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a1, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a0, t2, t1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s0, s1, s0 ; RV64I-NEXT: 
slli s6, s6, 16 -; RV64I-NEXT: or t1, s6, s5 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: or t3, s0, t5 +; RV64I-NEXT: or t0, s6, s2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: or t3, s5, t5 ; RV64I-NEXT: slli s3, s3, 16 -; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: or t5, s3, t6 ; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: or t6, s7, t6 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: or t6, s7, s4 ; RV64I-NEXT: li a7, 64 ; RV64I-NEXT: slli t4, a5, 16 ; RV64I-NEXT: slli t2, a6, 16 -; RV64I-NEXT: slli t1, t1, 32 -; RV64I-NEXT: slli t5, t5, 32 -; RV64I-NEXT: slli s0, s0, 32 -; RV64I-NEXT: or a6, t1, t0 -; RV64I-NEXT: or t0, t5, t3 -; RV64I-NEXT: or a5, s0, t6 +; RV64I-NEXT: slli t0, t0, 32 +; RV64I-NEXT: slli t3, t3, 32 +; RV64I-NEXT: slli t6, t6, 32 +; RV64I-NEXT: or a6, t0, s0 +; RV64I-NEXT: or t0, t3, t1 +; RV64I-NEXT: or a5, t6, t5 ; RV64I-NEXT: slli a5, a5, 5 ; RV64I-NEXT: subw t1, a5, a7 ; RV64I-NEXT: negw t5, a5 @@ -3932,49 +3904,49 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: lbu t0, 21(a0) ; RV32I-NEXT: lbu t1, 22(a0) ; RV32I-NEXT: lbu t2, 23(a0) -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t4, 25(a0) -; RV32I-NEXT: lbu t5, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 24(a0) +; RV32I-NEXT: lbu t3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a6, a6, a5 ; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a4, t2, t1 -; RV32I-NEXT: lbu a7, 28(a0) -; RV32I-NEXT: lbu t0, 29(a0) -; RV32I-NEXT: lbu t1, 30(a0) -; RV32I-NEXT: lbu t2, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t4, t4, t3 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: lbu t0, 28(a0) +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: lbu t3, 31(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: lbu t2, 0(a1) ; RV32I-NEXT: lbu t3, 1(a1) -; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: or t5, a1, t5 ; RV32I-NEXT: li t3, 32 ; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a1, a4, 16 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli a4, t1, 16 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: or t1, t5, t4 -; RV32I-NEXT: or t5, a4, a7 -; RV32I-NEXT: or a4, t2, t0 +; RV32I-NEXT: slli a1, a7, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a7, t1, 16 +; RV32I-NEXT: slli t6, t5, 16 +; RV32I-NEXT: or t1, t4, a4 +; RV32I-NEXT: or t5, a7, t0 +; RV32I-NEXT: or a4, t6, t2 ; RV32I-NEXT: slli a4, a4, 5 -; RV32I-NEXT: srl s0, t1, a4 +; RV32I-NEXT: srl s1, t1, a4 ; RV32I-NEXT: neg s6, a4 ; RV32I-NEXT: sll t4, t5, s6 ; RV32I-NEXT: bltu a4, t3, .LBB13_2 @@ -3982,7 +3954,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun 
; RV32I-NEXT: srl a7, t5, a4 ; RV32I-NEXT: j .LBB13_3 ; RV32I-NEXT: .LBB13_2: -; RV32I-NEXT: or a7, s0, t4 +; RV32I-NEXT: or a7, s1, t4 ; RV32I-NEXT: .LBB13_3: ; RV32I-NEXT: or t0, a6, a3 ; RV32I-NEXT: or a6, a1, a5 @@ -3996,11 +3968,11 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t3, .LBB13_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: li s0, 0 ; RV32I-NEXT: srl a3, a6, a4 ; RV32I-NEXT: j .LBB13_8 ; RV32I-NEXT: .LBB13_7: -; RV32I-NEXT: srl ra, t5, a4 +; RV32I-NEXT: srl s0, t5, a4 ; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: .LBB13_8: ; RV32I-NEXT: li t6, 64 @@ -4037,29 +4009,29 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: mv t4, a3 ; RV32I-NEXT: .LBB13_18: ; RV32I-NEXT: neg s11, s9 -; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t3, .LBB13_20 ; RV32I-NEXT: # %bb.19: -; RV32I-NEXT: srl s2, t5, s9 +; RV32I-NEXT: srl s1, t5, s9 ; RV32I-NEXT: j .LBB13_21 ; RV32I-NEXT: .LBB13_20: ; RV32I-NEXT: sll a3, t5, s11 -; RV32I-NEXT: or s2, s0, a3 +; RV32I-NEXT: or s1, s1, a3 ; RV32I-NEXT: .LBB13_21: -; RV32I-NEXT: lbu s1, 11(a0) +; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s2, 11(a0) ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: mv s0, t1 ; RV32I-NEXT: beqz s9, .LBB13_23 ; RV32I-NEXT: # %bb.22: -; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: mv s0, s1 ; RV32I-NEXT: .LBB13_23: -; RV32I-NEXT: lbu s4, 9(a0) -; RV32I-NEXT: lbu s2, 10(a0) -; RV32I-NEXT: lbu s5, 13(a0) -; RV32I-NEXT: lbu s8, 14(a0) -; RV32I-NEXT: slli s3, s1, 8 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s4, 10(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu ra, 14(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t3, .LBB13_25 ; RV32I-NEXT: # %bb.24: ; RV32I-NEXT: li s1, 0 @@ -4067,12 +4039,12 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: .LBB13_25: ; RV32I-NEXT: srl s1, t5, a4 ; RV32I-NEXT: .LBB13_26: -; RV32I-NEXT: or s2, s3, s2 -; RV32I-NEXT: lbu ra, 8(a0) -; RV32I-NEXT: lbu s3, 12(a0) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: or s8, a3, s8 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s2, s2, s4 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s4, 12(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or ra, a3, ra ; RV32I-NEXT: bgeu a4, t6, .LBB13_28 ; RV32I-NEXT: # %bb.27: ; RV32I-NEXT: or s0, a7, t2 @@ -4080,10 +4052,10 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: .LBB13_28: ; RV32I-NEXT: lbu a3, 3(a0) ; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: or a5, s4, ra +; RV32I-NEXT: or a5, s3, s5 ; RV32I-NEXT: slli t4, s2, 16 -; RV32I-NEXT: or s2, s5, s3 -; RV32I-NEXT: slli s3, s8, 16 +; RV32I-NEXT: or s2, s8, s4 +; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: mv s4, t0 ; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a4, .LBB13_30 @@ -4091,25 +4063,25 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: mv s4, s0 ; RV32I-NEXT: mv a7, s1 ; RV32I-NEXT: .LBB13_30: -; RV32I-NEXT: slli s5, a3, 8 -; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: slli s3, a3, 8 +; RV32I-NEXT: lbu s8, 1(a0) ; RV32I-NEXT: lbu a3, 2(a0) ; RV32I-NEXT: lbu s1, 5(a0) ; RV32I-NEXT: lbu s0, 6(a0) -; RV32I-NEXT: slli s8, t2, 8 +; RV32I-NEXT: slli s5, 
t2, 8 ; RV32I-NEXT: or t4, t4, a5 -; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t2, ra, s2 ; RV32I-NEXT: bltu a4, t6, .LBB13_32 ; RV32I-NEXT: # %bb.31: ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB13_32: -; RV32I-NEXT: slli s3, ra, 8 -; RV32I-NEXT: or a5, s5, a3 -; RV32I-NEXT: lbu s5, 0(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a5, s3, a3 +; RV32I-NEXT: lbu s3, 0(a0) ; RV32I-NEXT: lbu a0, 4(a0) ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or a3, s8, s0 +; RV32I-NEXT: or a3, s5, s0 ; RV32I-NEXT: srl s2, t4, a4 ; RV32I-NEXT: sll ra, t2, s6 ; RV32I-NEXT: bltu a4, t3, .LBB13_34 @@ -4119,7 +4091,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: .LBB13_34: ; RV32I-NEXT: or s0, s2, ra ; RV32I-NEXT: .LBB13_35: -; RV32I-NEXT: or s3, s3, s5 +; RV32I-NEXT: or s3, s8, s3 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: slli a3, a3, 16 @@ -4561,88 +4533,88 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 ; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli s3, s3, 8 ; RV64I-NEXT: or a4, t4, t3 ; RV64I-NEXT: or a6, t6, t5 -; RV64I-NEXT: or t0, s1, s0 -; RV64I-NEXT: lbu t5, 24(a0) -; RV64I-NEXT: lbu t6, 25(a0) -; RV64I-NEXT: lbu s0, 26(a0) -; RV64I-NEXT: lbu s1, 27(a0) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: or t4, s3, s2 -; RV64I-NEXT: or t2, s5, s4 -; RV64I-NEXT: or t3, s7, s6 -; RV64I-NEXT: lbu s2, 28(a0) -; RV64I-NEXT: lbu s3, 29(a0) -; RV64I-NEXT: lbu s4, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 ; RV64I-NEXT: lbu t6, 0(a1) -; RV64I-NEXT: lbu s1, 1(a1) -; RV64I-NEXT: lbu s7, 2(a1) -; RV64I-NEXT: lbu s8, 3(a1) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, a0, s4 -; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or s5, a0, s5 +; RV64I-NEXT: or t6, s3, t6 +; RV64I-NEXT: or s3, s7, s4 ; RV64I-NEXT: lbu a0, 4(a1) -; RV64I-NEXT: lbu s1, 5(a1) -; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 
7(a1) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s7, s8, s7 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, a0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or s4, a1, s4 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a7 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: or a0, t4, t0 -; RV64I-NEXT: slli t3, t3, 16 -; RV64I-NEXT: or t0, t3, t2 +; RV64I-NEXT: or s7, a1, s7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a1, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a0, t2, t1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s0, s1, s0 ; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: or t1, s6, s5 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: or t3, s0, t5 +; RV64I-NEXT: or t0, s6, s2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: or t3, s5, t5 ; RV64I-NEXT: slli s3, s3, 16 -; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: or t5, s3, t6 ; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: or t6, s7, t6 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: or t6, s7, s4 ; RV64I-NEXT: li a7, 64 ; RV64I-NEXT: slli t4, a5, 16 ; RV64I-NEXT: slli t2, a6, 16 -; RV64I-NEXT: slli t1, t1, 32 -; RV64I-NEXT: slli t5, t5, 32 -; RV64I-NEXT: slli s0, s0, 32 -; RV64I-NEXT: or a6, t1, t0 -; RV64I-NEXT: or t0, t5, t3 -; RV64I-NEXT: or a5, s0, t6 +; RV64I-NEXT: slli t0, t0, 32 +; RV64I-NEXT: slli t3, t3, 32 +; RV64I-NEXT: slli t6, t6, 32 +; RV64I-NEXT: or a6, t0, s0 +; RV64I-NEXT: or t0, t3, t1 +; RV64I-NEXT: or a5, t6, t5 ; RV64I-NEXT: slli a5, a5, 6 ; RV64I-NEXT: subw t1, a5, a7 ; RV64I-NEXT: negw t5, a5 @@ -4856,49 +4828,49 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: lbu t0, 21(a0) ; RV32I-NEXT: lbu t1, 22(a0) ; RV32I-NEXT: lbu t2, 23(a0) -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t4, 25(a0) -; RV32I-NEXT: lbu t5, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 24(a0) +; RV32I-NEXT: lbu t3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a6, a6, a5 ; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a4, t2, t1 -; RV32I-NEXT: lbu a7, 28(a0) -; RV32I-NEXT: lbu t0, 29(a0) -; RV32I-NEXT: lbu t1, 30(a0) -; RV32I-NEXT: lbu t2, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t4, t4, t3 -; RV32I-NEXT: or t5, t6, t5 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: lbu t0, 28(a0) +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: lbu t3, 31(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: lbu t2, 0(a1) ; RV32I-NEXT: lbu t3, 1(a1) -; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: or t5, a1, t5 ; RV32I-NEXT: li t3, 32 ; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a1, a4, 16 -; RV32I-NEXT: slli t5, t5, 
16 -; RV32I-NEXT: slli a4, t1, 16 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: or t1, t5, t4 -; RV32I-NEXT: or t5, a4, a7 -; RV32I-NEXT: or a4, t2, t0 +; RV32I-NEXT: slli a1, a7, 16 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a7, t1, 16 +; RV32I-NEXT: slli t6, t5, 16 +; RV32I-NEXT: or t1, t4, a4 +; RV32I-NEXT: or t5, a7, t0 +; RV32I-NEXT: or a4, t6, t2 ; RV32I-NEXT: slli a4, a4, 6 -; RV32I-NEXT: srl s0, t1, a4 +; RV32I-NEXT: srl s1, t1, a4 ; RV32I-NEXT: neg s6, a4 ; RV32I-NEXT: sll t4, t5, s6 ; RV32I-NEXT: bltu a4, t3, .LBB14_2 @@ -4906,7 +4878,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: srl a7, t5, a4 ; RV32I-NEXT: j .LBB14_3 ; RV32I-NEXT: .LBB14_2: -; RV32I-NEXT: or a7, s0, t4 +; RV32I-NEXT: or a7, s1, t4 ; RV32I-NEXT: .LBB14_3: ; RV32I-NEXT: or t0, a6, a3 ; RV32I-NEXT: or a6, a1, a5 @@ -4920,11 +4892,11 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sw a3, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t3, .LBB14_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: li ra, 0 +; RV32I-NEXT: li s0, 0 ; RV32I-NEXT: srl a3, a6, a4 ; RV32I-NEXT: j .LBB14_8 ; RV32I-NEXT: .LBB14_7: -; RV32I-NEXT: srl ra, t5, a4 +; RV32I-NEXT: srl s0, t5, a4 ; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: .LBB14_8: ; RV32I-NEXT: li t6, 64 @@ -4961,29 +4933,29 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: mv t4, a3 ; RV32I-NEXT: .LBB14_18: ; RV32I-NEXT: neg s11, s9 -; RV32I-NEXT: sw s0, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t3, .LBB14_20 ; RV32I-NEXT: # %bb.19: -; RV32I-NEXT: srl s2, t5, s9 +; RV32I-NEXT: srl s1, t5, s9 ; RV32I-NEXT: j .LBB14_21 ; RV32I-NEXT: .LBB14_20: ; RV32I-NEXT: sll a3, t5, s11 -; RV32I-NEXT: or s2, s0, a3 +; RV32I-NEXT: or s1, s1, a3 ; RV32I-NEXT: .LBB14_21: -; RV32I-NEXT: lbu s1, 11(a0) +; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu s2, 11(a0) ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: mv s0, t1 ; RV32I-NEXT: beqz s9, .LBB14_23 ; RV32I-NEXT: # %bb.22: -; RV32I-NEXT: mv s0, s2 +; RV32I-NEXT: mv s0, s1 ; RV32I-NEXT: .LBB14_23: -; RV32I-NEXT: lbu s4, 9(a0) -; RV32I-NEXT: lbu s2, 10(a0) -; RV32I-NEXT: lbu s5, 13(a0) -; RV32I-NEXT: lbu s8, 14(a0) -; RV32I-NEXT: slli s3, s1, 8 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s4, 10(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu ra, 14(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: sw ra, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t3, .LBB14_25 ; RV32I-NEXT: # %bb.24: ; RV32I-NEXT: li s1, 0 @@ -4991,12 +4963,12 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: .LBB14_25: ; RV32I-NEXT: srl s1, t5, a4 ; RV32I-NEXT: .LBB14_26: -; RV32I-NEXT: or s2, s3, s2 -; RV32I-NEXT: lbu ra, 8(a0) -; RV32I-NEXT: lbu s3, 12(a0) -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: or s8, a3, s8 +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s2, s2, s4 +; RV32I-NEXT: lbu s5, 8(a0) +; RV32I-NEXT: lbu s4, 12(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or ra, a3, ra ; RV32I-NEXT: bgeu a4, t6, .LBB14_28 ; RV32I-NEXT: # %bb.27: ; RV32I-NEXT: or s0, a7, t2 @@ -5004,10 +4976,10 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: .LBB14_28: ; RV32I-NEXT: lbu a3, 3(a0) ; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: or a5, s4, ra +; RV32I-NEXT: or a5, s3, s5 ; RV32I-NEXT: slli 
t4, s2, 16 -; RV32I-NEXT: or s2, s5, s3 -; RV32I-NEXT: slli s3, s8, 16 +; RV32I-NEXT: or s2, s8, s4 +; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: mv s4, t0 ; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beqz a4, .LBB14_30 @@ -5015,25 +4987,25 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: mv s4, s0 ; RV32I-NEXT: mv a7, s1 ; RV32I-NEXT: .LBB14_30: -; RV32I-NEXT: slli s5, a3, 8 -; RV32I-NEXT: lbu ra, 1(a0) +; RV32I-NEXT: slli s3, a3, 8 +; RV32I-NEXT: lbu s8, 1(a0) ; RV32I-NEXT: lbu a3, 2(a0) ; RV32I-NEXT: lbu s1, 5(a0) ; RV32I-NEXT: lbu s0, 6(a0) -; RV32I-NEXT: slli s8, t2, 8 +; RV32I-NEXT: slli s5, t2, 8 ; RV32I-NEXT: or t4, t4, a5 -; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t2, ra, s2 ; RV32I-NEXT: bltu a4, t6, .LBB14_32 ; RV32I-NEXT: # %bb.31: ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB14_32: -; RV32I-NEXT: slli s3, ra, 8 -; RV32I-NEXT: or a5, s5, a3 -; RV32I-NEXT: lbu s5, 0(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or a5, s3, a3 +; RV32I-NEXT: lbu s3, 0(a0) ; RV32I-NEXT: lbu a0, 4(a0) ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or a3, s8, s0 +; RV32I-NEXT: or a3, s5, s0 ; RV32I-NEXT: srl s2, t4, a4 ; RV32I-NEXT: sll ra, t2, s6 ; RV32I-NEXT: bltu a4, t3, .LBB14_34 @@ -5043,7 +5015,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: .LBB14_34: ; RV32I-NEXT: or s0, s2, ra ; RV32I-NEXT: .LBB14_35: -; RV32I-NEXT: or s3, s3, s5 +; RV32I-NEXT: or s3, s8, s3 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: slli a3, a3, 16 @@ -5784,54 +5756,54 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: lbu t0, 0(a1) -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or t0, t3, t2 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) ; RV32I-NEXT: lbu t3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: li s9, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t3 ; RV32I-NEXT: li t4, 32 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or t3, a5, a4 -; RV32I-NEXT: or a5, t2, a7 -; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, a1, t1 ; RV32I-NEXT: slli a4, a4, 3 -; RV32I-NEXT: neg s10, a4 -; RV32I-NEXT: srl t5, t3, s10 -; RV32I-NEXT: sll s5, a5, a4 +; RV32I-NEXT: neg s5, a4 +; RV32I-NEXT: srl t5, t3, s5 +; RV32I-NEXT: sll s10, a5, a4 ; RV32I-NEXT: bltu a4, t4, .LBB15_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: li s8, 0 -; RV32I-NEXT: sll a7, t3, a4 +; RV32I-NEXT: sll t0, t3, a4 ; RV32I-NEXT: j .LBB15_3 ; RV32I-NEXT: .LBB15_2: ; RV32I-NEXT: sll s8, t3, a4 -; RV32I-NEXT: or a7, t5, s5 +; RV32I-NEXT: or t0, t5, s10 ; RV32I-NEXT: .LBB15_3: +; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: lbu t2, 9(a0) -; RV32I-NEXT: lbu a1, 10(a0) +; RV32I-NEXT: lbu a7, 10(a0) ; RV32I-NEXT: lbu t1, 13(a0) -; RV32I-NEXT: lbu t0, 14(a0) -; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: lbu a1, 14(a0) ; RV32I-NEXT: slli t6, a3, 8 ; RV32I-NEXT: sub s6, s9, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a4, .LBB15_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a3, t0 ; RV32I-NEXT: .LBB15_5: -; RV32I-NEXT: slli a7, t2, 8 -; RV32I-NEXT: or a6, a6, a1 +; RV32I-NEXT: slli 
t0, t2, 8 +; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu a1, 12(a0) +; RV32I-NEXT: lbu a7, 12(a0) ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: or a1, t6, a1 ; RV32I-NEXT: neg t6, s6 ; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s6, t4, .LBB15_7 @@ -5842,25 +5814,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll t6, a5, t6 ; RV32I-NEXT: or t6, t5, t6 ; RV32I-NEXT: .LBB15_8: -; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: or t0, t0, t2 ; RV32I-NEXT: slli t2, a6, 16 -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: or a7, t1, a7 +; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: mv a6, t3 ; RV32I-NEXT: beqz s6, .LBB15_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a6, t6 ; RV32I-NEXT: .LBB15_10: -; RV32I-NEXT: or t1, t2, a7 -; RV32I-NEXT: or t2, t0, a1 +; RV32I-NEXT: or t1, t2, t0 +; RV32I-NEXT: or t2, a1, a7 ; RV32I-NEXT: bltu s6, t4, .LBB15_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li t0, 0 ; RV32I-NEXT: j .LBB15_13 ; RV32I-NEXT: .LBB15_12: -; RV32I-NEXT: srl a7, a5, s10 +; RV32I-NEXT: srl t0, a5, s5 ; RV32I-NEXT: .LBB15_13: -; RV32I-NEXT: srl s0, t1, s10 +; RV32I-NEXT: srl s0, t1, s5 ; RV32I-NEXT: sll a1, t2, a4 ; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t4, .LBB15_15 @@ -5890,7 +5862,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB15_20: ; RV32I-NEXT: sll s2, t3, a4 ; RV32I-NEXT: srl a1, t3, a1 -; RV32I-NEXT: or a1, a1, s5 +; RV32I-NEXT: or a1, a1, s10 ; RV32I-NEXT: mv s4, a5 ; RV32I-NEXT: beqz s7, .LBB15_22 ; RV32I-NEXT: .LBB15_21: @@ -5905,7 +5877,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB15_24: ; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: or s2, a6, s1 -; RV32I-NEXT: or s4, a7, s3 +; RV32I-NEXT: or s4, t0, s3 ; RV32I-NEXT: .LBB15_25: ; RV32I-NEXT: sub ra, a1, a4 ; RV32I-NEXT: mv a7, t1 @@ -5920,15 +5892,15 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: bltu ra, t4, .LBB15_29 ; RV32I-NEXT: # %bb.28: ; RV32I-NEXT: srl a1, t2, ra -; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, t1 ; RV32I-NEXT: bnez ra, .LBB15_30 ; RV32I-NEXT: j .LBB15_31 ; RV32I-NEXT: .LBB15_29: ; RV32I-NEXT: or a1, s0, s2 -; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, t1 ; RV32I-NEXT: beqz ra, .LBB15_31 ; RV32I-NEXT: .LBB15_30: -; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, a1 ; RV32I-NEXT: .LBB15_31: ; RV32I-NEXT: bltu ra, t4, .LBB15_33 ; RV32I-NEXT: # %bb.32: @@ -5938,7 +5910,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: bnez ra, .LBB15_34 ; RV32I-NEXT: j .LBB15_35 ; RV32I-NEXT: .LBB15_33: -; RV32I-NEXT: srl a1, t2, s10 +; RV32I-NEXT: srl a1, t2, s5 ; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sll a1, a5, s1 ; RV32I-NEXT: or a1, t5, a1 @@ -5959,7 +5931,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a1, a1, s2 ; RV32I-NEXT: j .LBB15_40 ; RV32I-NEXT: .LBB15_38: -; RV32I-NEXT: srl a1, a5, s10 +; RV32I-NEXT: srl a1, a5, s5 ; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s3, t4, .LBB15_37 ; RV32I-NEXT: .LBB15_39: @@ -5972,35 +5944,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: # %bb.41: ; 
RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: .LBB15_42: -; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s5, a7 +; RV32I-NEXT: sw t0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: bltu s4, t4, .LBB15_44 ; RV32I-NEXT: # %bb.43: -; RV32I-NEXT: srl t0, t2, s4 +; RV32I-NEXT: srl a7, t2, s4 ; RV32I-NEXT: j .LBB15_45 ; RV32I-NEXT: .LBB15_44: ; RV32I-NEXT: srl a1, t1, ra -; RV32I-NEXT: neg t0, s4 -; RV32I-NEXT: sll t0, t2, t0 -; RV32I-NEXT: or t0, a1, t0 +; RV32I-NEXT: neg a7, s4 +; RV32I-NEXT: sll a7, t2, a7 +; RV32I-NEXT: or a7, a1, a7 ; RV32I-NEXT: .LBB15_45: -; RV32I-NEXT: mv s0, s10 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: lbu s8, 19(a0) +; RV32I-NEXT: sw s10, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: li s0, 64 +; RV32I-NEXT: lbu t6, 19(a0) ; RV32I-NEXT: lbu a1, 23(a0) ; RV32I-NEXT: mv s3, t1 ; RV32I-NEXT: beqz s4, .LBB15_47 ; RV32I-NEXT: # %bb.46: -; RV32I-NEXT: mv s3, t0 +; RV32I-NEXT: mv s3, a7 ; RV32I-NEXT: .LBB15_47: -; RV32I-NEXT: mv a6, a3 -; RV32I-NEXT: lbu s10, 17(a0) -; RV32I-NEXT: lbu t0, 18(a0) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu a7, 18(a0) ; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu t6, 22(a0) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: lbu s8, 22(a0) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: li a3, 64 ; RV32I-NEXT: bltu s4, t4, .LBB15_49 ; RV32I-NEXT: # %bb.48: ; RV32I-NEXT: li s4, 0 @@ -6008,45 +5978,41 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB15_49: ; RV32I-NEXT: srl s4, t2, ra ; RV32I-NEXT: .LBB15_50: -; RV32I-NEXT: or s11, s8, t0 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or s10, t6, a7 +; RV32I-NEXT: lbu a7, 16(a0) +; RV32I-NEXT: lbu t6, 20(a0) ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t6, a1, t6 -; RV32I-NEXT: bgeu ra, a3, .LBB15_52 +; RV32I-NEXT: or s8, a1, s8 +; RV32I-NEXT: bgeu ra, s0, .LBB15_52 ; RV32I-NEXT: # %bb.51: ; RV32I-NEXT: or s3, t5, s1 ; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload ; RV32I-NEXT: or s4, a1, s2 ; RV32I-NEXT: .LBB15_52: -; RV32I-NEXT: or a1, s10, t0 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: or t0, s9, s8 -; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or a1, s11, a7 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: or a7, s9, t6 +; RV32I-NEXT: slli s8, s8, 16 ; RV32I-NEXT: mv t5, t3 -; RV32I-NEXT: mv s1, a5 -; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: mv t6, a5 ; RV32I-NEXT: beqz ra, .LBB15_54 ; RV32I-NEXT: # %bb.53: ; RV32I-NEXT: mv t5, s3 -; RV32I-NEXT: mv s1, s4 +; RV32I-NEXT: mv t6, s4 ; RV32I-NEXT: .LBB15_54: -; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or s2, s11, a1 -; RV32I-NEXT: or s1, t6, t0 +; RV32I-NEXT: or s2, s10, a1 +; RV32I-NEXT: or s1, s8, a7 ; RV32I-NEXT: li a1, 64 -; RV32I-NEXT: mv a6, a7 -; RV32I-NEXT: mv a7, s0 ; RV32I-NEXT: bltu ra, a1, .LBB15_56 ; RV32I-NEXT: # %bb.55: ; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB15_56: -; RV32I-NEXT: srl s3, s2, a7 -; RV32I-NEXT: sll ra, s1, a4 -; RV32I-NEXT: mv a7, s5 +; RV32I-NEXT: srl s3, s2, s5 +; RV32I-NEXT: sll s0, s1, a4 ; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw t6, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t4, .LBB15_58 ; RV32I-NEXT: # %bb.57: ; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill @@ -6055,54 +6021,54 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: 
.LBB15_58: ; RV32I-NEXT: sll a1, s2, a4 ; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a1, s3, ra +; RV32I-NEXT: or a1, s3, s0 ; RV32I-NEXT: .LBB15_59: -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: lbu s11, 27(a0) ; RV32I-NEXT: lbu t6, 31(a0) ; RV32I-NEXT: mv t5, s1 ; RV32I-NEXT: beqz a4, .LBB15_61 ; RV32I-NEXT: # %bb.60: ; RV32I-NEXT: mv t5, a1 ; RV32I-NEXT: .LBB15_61: -; RV32I-NEXT: lbu s8, 25(a0) -; RV32I-NEXT: lbu s4, 26(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) -; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: lbu s9, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s10, 29(a0) +; RV32I-NEXT: lbu s4, 30(a0) ; RV32I-NEXT: slli t6, t6, 8 ; RV32I-NEXT: bltu s6, t4, .LBB15_63 ; RV32I-NEXT: # %bb.62: -; RV32I-NEXT: srl t0, s1, s6 +; RV32I-NEXT: srl a7, s1, s6 ; RV32I-NEXT: j .LBB15_64 ; RV32I-NEXT: .LBB15_63: ; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sll a1, s1, a1 -; RV32I-NEXT: or t0, s3, a1 +; RV32I-NEXT: or a7, s3, a1 ; RV32I-NEXT: .LBB15_64: -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: lbu s3, 24(a0) -; RV32I-NEXT: lbu a1, 28(a0) -; RV32I-NEXT: or s4, s9, s4 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t6, t6, s10 +; RV32I-NEXT: slli s3, s9, 8 +; RV32I-NEXT: or a1, s11, s8 +; RV32I-NEXT: lbu s11, 24(a0) +; RV32I-NEXT: lbu s8, 28(a0) +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t6, t6, s4 ; RV32I-NEXT: mv s9, s2 ; RV32I-NEXT: beqz s6, .LBB15_66 ; RV32I-NEXT: # %bb.65: -; RV32I-NEXT: mv s9, t0 +; RV32I-NEXT: mv s9, a7 ; RV32I-NEXT: .LBB15_66: -; RV32I-NEXT: or a0, s8, s3 -; RV32I-NEXT: slli t0, s4, 16 -; RV32I-NEXT: or a1, s11, a1 +; RV32I-NEXT: or a0, s3, s11 +; RV32I-NEXT: slli a7, a1, 16 +; RV32I-NEXT: or a1, s10, s8 ; RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: bltu s6, t4, .LBB15_68 ; RV32I-NEXT: # %bb.67: ; RV32I-NEXT: li s4, 0 ; RV32I-NEXT: j .LBB15_69 ; RV32I-NEXT: .LBB15_68: -; RV32I-NEXT: srl s4, s1, s0 +; RV32I-NEXT: srl s4, s1, s5 ; RV32I-NEXT: .LBB15_69: ; RV32I-NEXT: li s11, 64 -; RV32I-NEXT: or s6, t0, a0 +; RV32I-NEXT: or s6, a7, a0 ; RV32I-NEXT: or a0, t6, a1 ; RV32I-NEXT: bltu a4, t4, .LBB15_71 ; RV32I-NEXT: # %bb.70: @@ -6113,9 +6079,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: j .LBB15_73 ; RV32I-NEXT: .LBB15_71: ; RV32I-NEXT: sll s3, s6, a4 -; RV32I-NEXT: srl a1, s6, s0 -; RV32I-NEXT: sll t0, a0, a4 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: srl a1, s6, s5 +; RV32I-NEXT: sll a7, a0, a4 +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: mv s10, a0 ; RV32I-NEXT: beqz a4, .LBB15_73 ; RV32I-NEXT: .LBB15_72: @@ -6132,7 +6098,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll s5, s2, a4 ; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: srl a1, s2, a1 -; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: or a1, a1, s0 ; RV32I-NEXT: mv s0, s1 ; RV32I-NEXT: beqz s7, .LBB15_77 ; RV32I-NEXT: .LBB15_76: @@ -6196,8 +6162,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB15_93: ; RV32I-NEXT: sll s10, t1, a4 ; RV32I-NEXT: srl a1, t1, s3 -; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: lw a7, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: j .LBB15_96 ; RV32I-NEXT: .LBB15_94: ; RV32I-NEXT: srl s4, a5, s3 @@ -6223,8 +6189,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll t4, t3, s9 ; RV32I-NEXT: neg a1, s11 
; RV32I-NEXT: srl a1, t3, a1 -; RV32I-NEXT: sll t0, a5, s9 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: sll a7, a5, s9 +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: beqz s11, .LBB15_102 ; RV32I-NEXT: .LBB15_101: ; RV32I-NEXT: mv a5, a1 @@ -6249,7 +6215,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: # %bb.107: ; RV32I-NEXT: li ra, 0 ; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li t0, 0 ; RV32I-NEXT: li a6, 0 ; RV32I-NEXT: bnez a4, .LBB15_109 ; RV32I-NEXT: j .LBB15_110 @@ -6276,8 +6242,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t1, ra, 24 ; RV32I-NEXT: srli a5, a3, 16 ; RV32I-NEXT: srli t4, a3, 24 -; RV32I-NEXT: srli t0, a7, 16 -; RV32I-NEXT: srli s0, a7, 24 +; RV32I-NEXT: srli a7, t0, 16 +; RV32I-NEXT: srli s0, t0, 24 ; RV32I-NEXT: srli t3, a6, 16 ; RV32I-NEXT: srli s3, a6, 24 ; RV32I-NEXT: srli t6, s2, 16 @@ -6296,7 +6262,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb s10, 1(a2) ; RV32I-NEXT: sb a4, 2(a2) ; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: and a4, a7, t2 +; RV32I-NEXT: and a4, t0, t2 ; RV32I-NEXT: srli t1, s11, 8 ; RV32I-NEXT: sb a3, 4(a2) ; RV32I-NEXT: sb t1, 5(a2) @@ -6304,9 +6270,9 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb t4, 7(a2) ; RV32I-NEXT: and a3, a6, t2 ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb t0, 8(a2) ; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: sb t0, 10(a2) +; RV32I-NEXT: sb a7, 10(a2) ; RV32I-NEXT: sb s0, 11(a2) ; RV32I-NEXT: and a4, s2, t2 ; RV32I-NEXT: srli a3, a3, 8 @@ -6698,54 +6664,54 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: lbu t0, 0(a1) -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or t0, t3, t2 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) ; RV32I-NEXT: lbu t3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: li s9, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t3 ; RV32I-NEXT: li t4, 32 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or t3, a5, a4 -; RV32I-NEXT: or a5, t2, a7 -; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, a1, t1 ; RV32I-NEXT: slli a4, a4, 5 -; RV32I-NEXT: neg s10, a4 -; RV32I-NEXT: srl t5, t3, s10 -; RV32I-NEXT: sll s5, a5, a4 +; RV32I-NEXT: neg s5, a4 +; RV32I-NEXT: srl t5, t3, s5 +; RV32I-NEXT: sll s10, a5, a4 ; RV32I-NEXT: bltu a4, t4, .LBB16_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: li s8, 0 -; RV32I-NEXT: sll a7, t3, a4 +; RV32I-NEXT: sll t0, t3, a4 ; RV32I-NEXT: j .LBB16_3 ; RV32I-NEXT: .LBB16_2: ; RV32I-NEXT: sll s8, t3, a4 -; RV32I-NEXT: or a7, t5, s5 +; RV32I-NEXT: or t0, t5, s10 ; RV32I-NEXT: .LBB16_3: +; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: lbu t2, 9(a0) -; RV32I-NEXT: lbu a1, 10(a0) +; RV32I-NEXT: lbu a7, 10(a0) ; RV32I-NEXT: lbu t1, 13(a0) -; RV32I-NEXT: lbu t0, 14(a0) -; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: lbu a1, 14(a0) ; RV32I-NEXT: slli t6, a3, 8 ; RV32I-NEXT: sub s6, s9, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: beqz a4, .LBB16_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a3, t0 ; RV32I-NEXT: .LBB16_5: -; 
RV32I-NEXT: slli a7, t2, 8 -; RV32I-NEXT: or a6, a6, a1 +; RV32I-NEXT: slli t0, t2, 8 +; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu a1, 12(a0) +; RV32I-NEXT: lbu a7, 12(a0) ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: or a1, t6, a1 ; RV32I-NEXT: neg t6, s6 ; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s6, t4, .LBB16_7 @@ -6756,25 +6722,25 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sll t6, a5, t6 ; RV32I-NEXT: or t6, t5, t6 ; RV32I-NEXT: .LBB16_8: -; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: or t0, t0, t2 ; RV32I-NEXT: slli t2, a6, 16 -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: or a7, t1, a7 +; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: mv a6, t3 ; RV32I-NEXT: beqz s6, .LBB16_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a6, t6 ; RV32I-NEXT: .LBB16_10: -; RV32I-NEXT: or t1, t2, a7 -; RV32I-NEXT: or t2, t0, a1 +; RV32I-NEXT: or t1, t2, t0 +; RV32I-NEXT: or t2, a1, a7 ; RV32I-NEXT: bltu s6, t4, .LBB16_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li t0, 0 ; RV32I-NEXT: j .LBB16_13 ; RV32I-NEXT: .LBB16_12: -; RV32I-NEXT: srl a7, a5, s10 +; RV32I-NEXT: srl t0, a5, s5 ; RV32I-NEXT: .LBB16_13: -; RV32I-NEXT: srl s0, t1, s10 +; RV32I-NEXT: srl s0, t1, s5 ; RV32I-NEXT: sll a1, t2, a4 ; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t4, .LBB16_15 @@ -6804,7 +6770,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: .LBB16_20: ; RV32I-NEXT: sll s2, t3, a4 ; RV32I-NEXT: srl a1, t3, a1 -; RV32I-NEXT: or a1, a1, s5 +; RV32I-NEXT: or a1, a1, s10 ; RV32I-NEXT: mv s4, a5 ; RV32I-NEXT: beqz s7, .LBB16_22 ; RV32I-NEXT: .LBB16_21: @@ -6819,7 +6785,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: .LBB16_24: ; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: or s2, a6, s1 -; RV32I-NEXT: or s4, a7, s3 +; RV32I-NEXT: or s4, t0, s3 ; RV32I-NEXT: .LBB16_25: ; RV32I-NEXT: sub ra, a1, a4 ; RV32I-NEXT: mv a7, t1 @@ -6834,15 +6800,15 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: bltu ra, t4, .LBB16_29 ; RV32I-NEXT: # %bb.28: ; RV32I-NEXT: srl a1, t2, ra -; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, t1 ; RV32I-NEXT: bnez ra, .LBB16_30 ; RV32I-NEXT: j .LBB16_31 ; RV32I-NEXT: .LBB16_29: ; RV32I-NEXT: or a1, s0, s2 -; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, t1 ; RV32I-NEXT: beqz ra, .LBB16_31 ; RV32I-NEXT: .LBB16_30: -; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, a1 ; RV32I-NEXT: .LBB16_31: ; RV32I-NEXT: bltu ra, t4, .LBB16_33 ; RV32I-NEXT: # %bb.32: @@ -6852,7 +6818,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: bnez ra, .LBB16_34 ; RV32I-NEXT: j .LBB16_35 ; RV32I-NEXT: .LBB16_33: -; RV32I-NEXT: srl a1, t2, s10 +; RV32I-NEXT: srl a1, t2, s5 ; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sll a1, a5, s1 ; RV32I-NEXT: or a1, t5, a1 @@ -6873,7 +6839,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: or a1, a1, s2 ; RV32I-NEXT: j .LBB16_40 ; RV32I-NEXT: .LBB16_38: -; RV32I-NEXT: srl a1, a5, s10 +; RV32I-NEXT: srl a1, a5, s5 ; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s3, t4, .LBB16_37 ; RV32I-NEXT: .LBB16_39: @@ -6886,35 +6852,33 @@ define 
void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: # %bb.41: ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: .LBB16_42: -; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s5, a7 +; RV32I-NEXT: sw t0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: bltu s4, t4, .LBB16_44 ; RV32I-NEXT: # %bb.43: -; RV32I-NEXT: srl t0, t2, s4 +; RV32I-NEXT: srl a7, t2, s4 ; RV32I-NEXT: j .LBB16_45 ; RV32I-NEXT: .LBB16_44: ; RV32I-NEXT: srl a1, t1, ra -; RV32I-NEXT: neg t0, s4 -; RV32I-NEXT: sll t0, t2, t0 -; RV32I-NEXT: or t0, a1, t0 +; RV32I-NEXT: neg a7, s4 +; RV32I-NEXT: sll a7, t2, a7 +; RV32I-NEXT: or a7, a1, a7 ; RV32I-NEXT: .LBB16_45: -; RV32I-NEXT: mv s0, s10 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: lbu s8, 19(a0) +; RV32I-NEXT: sw s10, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: li s0, 64 +; RV32I-NEXT: lbu t6, 19(a0) ; RV32I-NEXT: lbu a1, 23(a0) ; RV32I-NEXT: mv s3, t1 ; RV32I-NEXT: beqz s4, .LBB16_47 ; RV32I-NEXT: # %bb.46: -; RV32I-NEXT: mv s3, t0 +; RV32I-NEXT: mv s3, a7 ; RV32I-NEXT: .LBB16_47: -; RV32I-NEXT: mv a6, a3 -; RV32I-NEXT: lbu s10, 17(a0) -; RV32I-NEXT: lbu t0, 18(a0) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu a7, 18(a0) ; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu t6, 22(a0) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: lbu s8, 22(a0) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: li a3, 64 ; RV32I-NEXT: bltu s4, t4, .LBB16_49 ; RV32I-NEXT: # %bb.48: ; RV32I-NEXT: li s4, 0 @@ -6922,45 +6886,41 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: .LBB16_49: ; RV32I-NEXT: srl s4, t2, ra ; RV32I-NEXT: .LBB16_50: -; RV32I-NEXT: or s11, s8, t0 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or s10, t6, a7 +; RV32I-NEXT: lbu a7, 16(a0) +; RV32I-NEXT: lbu t6, 20(a0) ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t6, a1, t6 -; RV32I-NEXT: bgeu ra, a3, .LBB16_52 +; RV32I-NEXT: or s8, a1, s8 +; RV32I-NEXT: bgeu ra, s0, .LBB16_52 ; RV32I-NEXT: # %bb.51: ; RV32I-NEXT: or s3, t5, s1 ; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload ; RV32I-NEXT: or s4, a1, s2 ; RV32I-NEXT: .LBB16_52: -; RV32I-NEXT: or a1, s10, t0 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: or t0, s9, s8 -; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or a1, s11, a7 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: or a7, s9, t6 +; RV32I-NEXT: slli s8, s8, 16 ; RV32I-NEXT: mv t5, t3 -; RV32I-NEXT: mv s1, a5 -; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: mv t6, a5 ; RV32I-NEXT: beqz ra, .LBB16_54 ; RV32I-NEXT: # %bb.53: ; RV32I-NEXT: mv t5, s3 -; RV32I-NEXT: mv s1, s4 +; RV32I-NEXT: mv t6, s4 ; RV32I-NEXT: .LBB16_54: -; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or s2, s11, a1 -; RV32I-NEXT: or s1, t6, t0 +; RV32I-NEXT: or s2, s10, a1 +; RV32I-NEXT: or s1, s8, a7 ; RV32I-NEXT: li a1, 64 -; RV32I-NEXT: mv a6, a7 -; RV32I-NEXT: mv a7, s0 ; RV32I-NEXT: bltu ra, a1, .LBB16_56 ; RV32I-NEXT: # %bb.55: ; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB16_56: -; RV32I-NEXT: srl s3, s2, a7 -; RV32I-NEXT: sll ra, s1, a4 -; RV32I-NEXT: mv a7, s5 +; RV32I-NEXT: srl s3, s2, s5 +; RV32I-NEXT: sll s0, s1, a4 ; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw t6, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t4, .LBB16_58 ; RV32I-NEXT: # %bb.57: ; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill @@ -6969,54 
+6929,54 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: .LBB16_58: ; RV32I-NEXT: sll a1, s2, a4 ; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a1, s3, ra +; RV32I-NEXT: or a1, s3, s0 ; RV32I-NEXT: .LBB16_59: -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: lbu s11, 27(a0) ; RV32I-NEXT: lbu t6, 31(a0) ; RV32I-NEXT: mv t5, s1 ; RV32I-NEXT: beqz a4, .LBB16_61 ; RV32I-NEXT: # %bb.60: ; RV32I-NEXT: mv t5, a1 ; RV32I-NEXT: .LBB16_61: -; RV32I-NEXT: lbu s8, 25(a0) -; RV32I-NEXT: lbu s4, 26(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) -; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: lbu s9, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s10, 29(a0) +; RV32I-NEXT: lbu s4, 30(a0) ; RV32I-NEXT: slli t6, t6, 8 ; RV32I-NEXT: bltu s6, t4, .LBB16_63 ; RV32I-NEXT: # %bb.62: -; RV32I-NEXT: srl t0, s1, s6 +; RV32I-NEXT: srl a7, s1, s6 ; RV32I-NEXT: j .LBB16_64 ; RV32I-NEXT: .LBB16_63: ; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sll a1, s1, a1 -; RV32I-NEXT: or t0, s3, a1 +; RV32I-NEXT: or a7, s3, a1 ; RV32I-NEXT: .LBB16_64: -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: lbu s3, 24(a0) -; RV32I-NEXT: lbu a1, 28(a0) -; RV32I-NEXT: or s4, s9, s4 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t6, t6, s10 +; RV32I-NEXT: slli s3, s9, 8 +; RV32I-NEXT: or a1, s11, s8 +; RV32I-NEXT: lbu s11, 24(a0) +; RV32I-NEXT: lbu s8, 28(a0) +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t6, t6, s4 ; RV32I-NEXT: mv s9, s2 ; RV32I-NEXT: beqz s6, .LBB16_66 ; RV32I-NEXT: # %bb.65: -; RV32I-NEXT: mv s9, t0 +; RV32I-NEXT: mv s9, a7 ; RV32I-NEXT: .LBB16_66: -; RV32I-NEXT: or a0, s8, s3 -; RV32I-NEXT: slli t0, s4, 16 -; RV32I-NEXT: or a1, s11, a1 +; RV32I-NEXT: or a0, s3, s11 +; RV32I-NEXT: slli a7, a1, 16 +; RV32I-NEXT: or a1, s10, s8 ; RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: bltu s6, t4, .LBB16_68 ; RV32I-NEXT: # %bb.67: ; RV32I-NEXT: li s4, 0 ; RV32I-NEXT: j .LBB16_69 ; RV32I-NEXT: .LBB16_68: -; RV32I-NEXT: srl s4, s1, s0 +; RV32I-NEXT: srl s4, s1, s5 ; RV32I-NEXT: .LBB16_69: ; RV32I-NEXT: li s11, 64 -; RV32I-NEXT: or s6, t0, a0 +; RV32I-NEXT: or s6, a7, a0 ; RV32I-NEXT: or a0, t6, a1 ; RV32I-NEXT: bltu a4, t4, .LBB16_71 ; RV32I-NEXT: # %bb.70: @@ -7027,9 +6987,9 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: j .LBB16_73 ; RV32I-NEXT: .LBB16_71: ; RV32I-NEXT: sll s3, s6, a4 -; RV32I-NEXT: srl a1, s6, s0 -; RV32I-NEXT: sll t0, a0, a4 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: srl a1, s6, s5 +; RV32I-NEXT: sll a7, a0, a4 +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: mv s10, a0 ; RV32I-NEXT: beqz a4, .LBB16_73 ; RV32I-NEXT: .LBB16_72: @@ -7046,7 +7006,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sll s5, s2, a4 ; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: srl a1, s2, a1 -; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: or a1, a1, s0 ; RV32I-NEXT: mv s0, s1 ; RV32I-NEXT: beqz s7, .LBB16_77 ; RV32I-NEXT: .LBB16_76: @@ -7110,8 +7070,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: .LBB16_93: ; RV32I-NEXT: sll s10, t1, a4 ; RV32I-NEXT: srl a1, t1, s3 -; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: lw a7, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: j .LBB16_96 ; RV32I-NEXT: .LBB16_94: ; RV32I-NEXT: srl s4, a5, s3 @@ -7137,8 +7097,8 @@ define void 
@shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sll t4, t3, s9 ; RV32I-NEXT: neg a1, s11 ; RV32I-NEXT: srl a1, t3, a1 -; RV32I-NEXT: sll t0, a5, s9 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: sll a7, a5, s9 +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: beqz s11, .LBB16_102 ; RV32I-NEXT: .LBB16_101: ; RV32I-NEXT: mv a5, a1 @@ -7163,7 +7123,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: # %bb.107: ; RV32I-NEXT: li ra, 0 ; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li t0, 0 ; RV32I-NEXT: li a6, 0 ; RV32I-NEXT: bnez a4, .LBB16_109 ; RV32I-NEXT: j .LBB16_110 @@ -7190,8 +7150,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: srli t1, ra, 24 ; RV32I-NEXT: srli a5, a3, 16 ; RV32I-NEXT: srli t4, a3, 24 -; RV32I-NEXT: srli t0, a7, 16 -; RV32I-NEXT: srli s0, a7, 24 +; RV32I-NEXT: srli a7, t0, 16 +; RV32I-NEXT: srli s0, t0, 24 ; RV32I-NEXT: srli t3, a6, 16 ; RV32I-NEXT: srli s3, a6, 24 ; RV32I-NEXT: srli t6, s2, 16 @@ -7210,7 +7170,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sb s10, 1(a2) ; RV32I-NEXT: sb a4, 2(a2) ; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: and a4, a7, t2 +; RV32I-NEXT: and a4, t0, t2 ; RV32I-NEXT: srli t1, s11, 8 ; RV32I-NEXT: sb a3, 4(a2) ; RV32I-NEXT: sb t1, 5(a2) @@ -7218,9 +7178,9 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sb t4, 7(a2) ; RV32I-NEXT: and a3, a6, t2 ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb t0, 8(a2) ; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: sb t0, 10(a2) +; RV32I-NEXT: sb a7, 10(a2) ; RV32I-NEXT: sb s0, 11(a2) ; RV32I-NEXT: and a4, s2, t2 ; RV32I-NEXT: srli a3, a3, 8 @@ -7612,54 +7572,54 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: slli t3, t3, 8 ; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: lbu t0, 0(a1) -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: or t0, t3, t2 +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) ; RV32I-NEXT: lbu t3, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: li s9, 64 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, t3 ; RV32I-NEXT: li t4, 32 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: or t3, a5, a4 -; RV32I-NEXT: or a5, t2, a7 -; RV32I-NEXT: or a4, a1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a4, a1, t1 ; RV32I-NEXT: slli a4, a4, 6 -; RV32I-NEXT: neg s10, a4 -; RV32I-NEXT: srl t5, t3, s10 -; RV32I-NEXT: sll s5, a5, a4 +; RV32I-NEXT: neg s5, a4 +; RV32I-NEXT: srl t5, t3, s5 +; RV32I-NEXT: sll s10, a5, a4 ; RV32I-NEXT: bltu a4, t4, .LBB17_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: li s8, 0 -; RV32I-NEXT: sll a7, t3, a4 +; RV32I-NEXT: sll t0, t3, a4 ; RV32I-NEXT: j .LBB17_3 ; RV32I-NEXT: .LBB17_2: ; RV32I-NEXT: sll s8, t3, a4 -; RV32I-NEXT: or a7, t5, s5 +; RV32I-NEXT: or t0, t5, s10 ; RV32I-NEXT: .LBB17_3: +; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: lbu t2, 9(a0) -; RV32I-NEXT: lbu a1, 10(a0) +; RV32I-NEXT: lbu a7, 10(a0) ; RV32I-NEXT: lbu t1, 13(a0) -; RV32I-NEXT: lbu t0, 14(a0) -; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: lbu a1, 14(a0) ; RV32I-NEXT: slli t6, a3, 8 ; RV32I-NEXT: sub s6, s9, a4 ; RV32I-NEXT: mv a3, a5 ; 
RV32I-NEXT: beqz a4, .LBB17_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a3, t0 ; RV32I-NEXT: .LBB17_5: -; RV32I-NEXT: slli a7, t2, 8 -; RV32I-NEXT: or a6, a6, a1 +; RV32I-NEXT: slli t0, t2, 8 +; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: lbu t2, 8(a0) -; RV32I-NEXT: lbu a1, 12(a0) +; RV32I-NEXT: lbu a7, 12(a0) ; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: or a1, t6, a1 ; RV32I-NEXT: neg t6, s6 ; RV32I-NEXT: sw t6, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s6, t4, .LBB17_7 @@ -7670,25 +7630,25 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sll t6, a5, t6 ; RV32I-NEXT: or t6, t5, t6 ; RV32I-NEXT: .LBB17_8: -; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: or t0, t0, t2 ; RV32I-NEXT: slli t2, a6, 16 -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: or a7, t1, a7 +; RV32I-NEXT: slli a1, a1, 16 ; RV32I-NEXT: mv a6, t3 ; RV32I-NEXT: beqz s6, .LBB17_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a6, t6 ; RV32I-NEXT: .LBB17_10: -; RV32I-NEXT: or t1, t2, a7 -; RV32I-NEXT: or t2, t0, a1 +; RV32I-NEXT: or t1, t2, t0 +; RV32I-NEXT: or t2, a1, a7 ; RV32I-NEXT: bltu s6, t4, .LBB17_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li t0, 0 ; RV32I-NEXT: j .LBB17_13 ; RV32I-NEXT: .LBB17_12: -; RV32I-NEXT: srl a7, a5, s10 +; RV32I-NEXT: srl t0, a5, s5 ; RV32I-NEXT: .LBB17_13: -; RV32I-NEXT: srl s0, t1, s10 +; RV32I-NEXT: srl s0, t1, s5 ; RV32I-NEXT: sll a1, t2, a4 ; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu a4, t4, .LBB17_15 @@ -7718,7 +7678,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: .LBB17_20: ; RV32I-NEXT: sll s2, t3, a4 ; RV32I-NEXT: srl a1, t3, a1 -; RV32I-NEXT: or a1, a1, s5 +; RV32I-NEXT: or a1, a1, s10 ; RV32I-NEXT: mv s4, a5 ; RV32I-NEXT: beqz s7, .LBB17_22 ; RV32I-NEXT: .LBB17_21: @@ -7733,7 +7693,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: .LBB17_24: ; RV32I-NEXT: sw s8, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: or s2, a6, s1 -; RV32I-NEXT: or s4, a7, s3 +; RV32I-NEXT: or s4, t0, s3 ; RV32I-NEXT: .LBB17_25: ; RV32I-NEXT: sub ra, a1, a4 ; RV32I-NEXT: mv a7, t1 @@ -7748,15 +7708,15 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: bltu ra, t4, .LBB17_29 ; RV32I-NEXT: # %bb.28: ; RV32I-NEXT: srl a1, t2, ra -; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, t1 ; RV32I-NEXT: bnez ra, .LBB17_30 ; RV32I-NEXT: j .LBB17_31 ; RV32I-NEXT: .LBB17_29: ; RV32I-NEXT: or a1, s0, s2 -; RV32I-NEXT: sw t1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, t1 ; RV32I-NEXT: beqz ra, .LBB17_31 ; RV32I-NEXT: .LBB17_30: -; RV32I-NEXT: sw a1, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, a1 ; RV32I-NEXT: .LBB17_31: ; RV32I-NEXT: bltu ra, t4, .LBB17_33 ; RV32I-NEXT: # %bb.32: @@ -7766,7 +7726,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: bnez ra, .LBB17_34 ; RV32I-NEXT: j .LBB17_35 ; RV32I-NEXT: .LBB17_33: -; RV32I-NEXT: srl a1, t2, s10 +; RV32I-NEXT: srl a1, t2, s5 ; RV32I-NEXT: sw a1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sll a1, a5, s1 ; RV32I-NEXT: or a1, t5, a1 @@ -7787,7 +7747,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: or a1, a1, s2 ; RV32I-NEXT: j .LBB17_40 ; RV32I-NEXT: .LBB17_38: -; RV32I-NEXT: srl a1, a5, s10 +; RV32I-NEXT: srl a1, a5, s5 ; RV32I-NEXT: 
sw a1, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s3, t4, .LBB17_37 ; RV32I-NEXT: .LBB17_39: @@ -7800,35 +7760,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: # %bb.41: ; RV32I-NEXT: mv s2, a1 ; RV32I-NEXT: .LBB17_42: -; RV32I-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s5, a7 +; RV32I-NEXT: sw t0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: bltu s4, t4, .LBB17_44 ; RV32I-NEXT: # %bb.43: -; RV32I-NEXT: srl t0, t2, s4 +; RV32I-NEXT: srl a7, t2, s4 ; RV32I-NEXT: j .LBB17_45 ; RV32I-NEXT: .LBB17_44: ; RV32I-NEXT: srl a1, t1, ra -; RV32I-NEXT: neg t0, s4 -; RV32I-NEXT: sll t0, t2, t0 -; RV32I-NEXT: or t0, a1, t0 +; RV32I-NEXT: neg a7, s4 +; RV32I-NEXT: sll a7, t2, a7 +; RV32I-NEXT: or a7, a1, a7 ; RV32I-NEXT: .LBB17_45: -; RV32I-NEXT: mv s0, s10 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: lbu s8, 19(a0) +; RV32I-NEXT: sw s10, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: li s0, 64 +; RV32I-NEXT: lbu t6, 19(a0) ; RV32I-NEXT: lbu a1, 23(a0) ; RV32I-NEXT: mv s3, t1 ; RV32I-NEXT: beqz s4, .LBB17_47 ; RV32I-NEXT: # %bb.46: -; RV32I-NEXT: mv s3, t0 +; RV32I-NEXT: mv s3, a7 ; RV32I-NEXT: .LBB17_47: -; RV32I-NEXT: mv a6, a3 -; RV32I-NEXT: lbu s10, 17(a0) -; RV32I-NEXT: lbu t0, 18(a0) +; RV32I-NEXT: slli t6, t6, 8 +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu a7, 18(a0) ; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu t6, 22(a0) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: lbu s8, 22(a0) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: li a3, 64 ; RV32I-NEXT: bltu s4, t4, .LBB17_49 ; RV32I-NEXT: # %bb.48: ; RV32I-NEXT: li s4, 0 @@ -7836,45 +7794,41 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: .LBB17_49: ; RV32I-NEXT: srl s4, t2, ra ; RV32I-NEXT: .LBB17_50: -; RV32I-NEXT: or s11, s8, t0 -; RV32I-NEXT: lbu t0, 16(a0) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or s10, t6, a7 +; RV32I-NEXT: lbu a7, 16(a0) +; RV32I-NEXT: lbu t6, 20(a0) ; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: or t6, a1, t6 -; RV32I-NEXT: bgeu ra, a3, .LBB17_52 +; RV32I-NEXT: or s8, a1, s8 +; RV32I-NEXT: bgeu ra, s0, .LBB17_52 ; RV32I-NEXT: # %bb.51: ; RV32I-NEXT: or s3, t5, s1 ; RV32I-NEXT: lw a1, 32(sp) # 4-byte Folded Reload ; RV32I-NEXT: or s4, a1, s2 ; RV32I-NEXT: .LBB17_52: -; RV32I-NEXT: or a1, s10, t0 -; RV32I-NEXT: slli s11, s11, 16 -; RV32I-NEXT: or t0, s9, s8 -; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or a1, s11, a7 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: or a7, s9, t6 +; RV32I-NEXT: slli s8, s8, 16 ; RV32I-NEXT: mv t5, t3 -; RV32I-NEXT: mv s1, a5 -; RV32I-NEXT: mv a3, a6 +; RV32I-NEXT: mv t6, a5 ; RV32I-NEXT: beqz ra, .LBB17_54 ; RV32I-NEXT: # %bb.53: ; RV32I-NEXT: mv t5, s3 -; RV32I-NEXT: mv s1, s4 +; RV32I-NEXT: mv t6, s4 ; RV32I-NEXT: .LBB17_54: -; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: or s2, s11, a1 -; RV32I-NEXT: or s1, t6, t0 +; RV32I-NEXT: or s2, s10, a1 +; RV32I-NEXT: or s1, s8, a7 ; RV32I-NEXT: li a1, 64 -; RV32I-NEXT: mv a6, a7 -; RV32I-NEXT: mv a7, s0 ; RV32I-NEXT: bltu ra, a1, .LBB17_56 ; RV32I-NEXT: # %bb.55: ; RV32I-NEXT: sw zero, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw zero, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB17_56: -; RV32I-NEXT: srl s3, s2, a7 -; RV32I-NEXT: sll ra, s1, a4 -; RV32I-NEXT: mv a7, s5 +; RV32I-NEXT: srl s3, s2, s5 +; RV32I-NEXT: sll s0, s1, a4 ; RV32I-NEXT: sw t5, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw t6, 4(sp) # 4-byte Folded Spill 
; RV32I-NEXT: bltu a4, t4, .LBB17_58 ; RV32I-NEXT: # %bb.57: ; RV32I-NEXT: sw zero, 32(sp) # 4-byte Folded Spill @@ -7883,54 +7837,54 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: .LBB17_58: ; RV32I-NEXT: sll a1, s2, a4 ; RV32I-NEXT: sw a1, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: or a1, s3, ra +; RV32I-NEXT: or a1, s3, s0 ; RV32I-NEXT: .LBB17_59: -; RV32I-NEXT: lbu s9, 27(a0) +; RV32I-NEXT: lbu s11, 27(a0) ; RV32I-NEXT: lbu t6, 31(a0) ; RV32I-NEXT: mv t5, s1 ; RV32I-NEXT: beqz a4, .LBB17_61 ; RV32I-NEXT: # %bb.60: ; RV32I-NEXT: mv t5, a1 ; RV32I-NEXT: .LBB17_61: -; RV32I-NEXT: lbu s8, 25(a0) -; RV32I-NEXT: lbu s4, 26(a0) -; RV32I-NEXT: lbu s11, 29(a0) -; RV32I-NEXT: lbu s10, 30(a0) -; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: lbu s9, 25(a0) +; RV32I-NEXT: lbu s8, 26(a0) +; RV32I-NEXT: lbu s10, 29(a0) +; RV32I-NEXT: lbu s4, 30(a0) ; RV32I-NEXT: slli t6, t6, 8 ; RV32I-NEXT: bltu s6, t4, .LBB17_63 ; RV32I-NEXT: # %bb.62: -; RV32I-NEXT: srl t0, s1, s6 +; RV32I-NEXT: srl a7, s1, s6 ; RV32I-NEXT: j .LBB17_64 ; RV32I-NEXT: .LBB17_63: ; RV32I-NEXT: lw a1, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sll a1, s1, a1 -; RV32I-NEXT: or t0, s3, a1 +; RV32I-NEXT: or a7, s3, a1 ; RV32I-NEXT: .LBB17_64: -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: lbu s3, 24(a0) -; RV32I-NEXT: lbu a1, 28(a0) -; RV32I-NEXT: or s4, s9, s4 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t6, t6, s10 +; RV32I-NEXT: slli s3, s9, 8 +; RV32I-NEXT: or a1, s11, s8 +; RV32I-NEXT: lbu s11, 24(a0) +; RV32I-NEXT: lbu s8, 28(a0) +; RV32I-NEXT: slli s10, s10, 8 +; RV32I-NEXT: or t6, t6, s4 ; RV32I-NEXT: mv s9, s2 ; RV32I-NEXT: beqz s6, .LBB17_66 ; RV32I-NEXT: # %bb.65: -; RV32I-NEXT: mv s9, t0 +; RV32I-NEXT: mv s9, a7 ; RV32I-NEXT: .LBB17_66: -; RV32I-NEXT: or a0, s8, s3 -; RV32I-NEXT: slli t0, s4, 16 -; RV32I-NEXT: or a1, s11, a1 +; RV32I-NEXT: or a0, s3, s11 +; RV32I-NEXT: slli a7, a1, 16 +; RV32I-NEXT: or a1, s10, s8 ; RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: bltu s6, t4, .LBB17_68 ; RV32I-NEXT: # %bb.67: ; RV32I-NEXT: li s4, 0 ; RV32I-NEXT: j .LBB17_69 ; RV32I-NEXT: .LBB17_68: -; RV32I-NEXT: srl s4, s1, s0 +; RV32I-NEXT: srl s4, s1, s5 ; RV32I-NEXT: .LBB17_69: ; RV32I-NEXT: li s11, 64 -; RV32I-NEXT: or s6, t0, a0 +; RV32I-NEXT: or s6, a7, a0 ; RV32I-NEXT: or a0, t6, a1 ; RV32I-NEXT: bltu a4, t4, .LBB17_71 ; RV32I-NEXT: # %bb.70: @@ -7941,9 +7895,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: j .LBB17_73 ; RV32I-NEXT: .LBB17_71: ; RV32I-NEXT: sll s3, s6, a4 -; RV32I-NEXT: srl a1, s6, s0 -; RV32I-NEXT: sll t0, a0, a4 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: srl a1, s6, s5 +; RV32I-NEXT: sll a7, a0, a4 +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: mv s10, a0 ; RV32I-NEXT: beqz a4, .LBB17_73 ; RV32I-NEXT: .LBB17_72: @@ -7960,7 +7914,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sll s5, s2, a4 ; RV32I-NEXT: lw a1, 16(sp) # 4-byte Folded Reload ; RV32I-NEXT: srl a1, s2, a1 -; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: or a1, a1, s0 ; RV32I-NEXT: mv s0, s1 ; RV32I-NEXT: beqz s7, .LBB17_77 ; RV32I-NEXT: .LBB17_76: @@ -8024,8 +7978,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: .LBB17_93: ; RV32I-NEXT: sll s10, t1, a4 ; RV32I-NEXT: srl a1, t1, s3 -; RV32I-NEXT: lw t0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: lw a7, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a1, 
a1, a7 ; RV32I-NEXT: j .LBB17_96 ; RV32I-NEXT: .LBB17_94: ; RV32I-NEXT: srl s4, a5, s3 @@ -8051,8 +8005,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sll t4, t3, s9 ; RV32I-NEXT: neg a1, s11 ; RV32I-NEXT: srl a1, t3, a1 -; RV32I-NEXT: sll t0, a5, s9 -; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: sll a7, a5, s9 +; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: beqz s11, .LBB17_102 ; RV32I-NEXT: .LBB17_101: ; RV32I-NEXT: mv a5, a1 @@ -8077,7 +8031,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: # %bb.107: ; RV32I-NEXT: li ra, 0 ; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: li a7, 0 +; RV32I-NEXT: li t0, 0 ; RV32I-NEXT: li a6, 0 ; RV32I-NEXT: bnez a4, .LBB17_109 ; RV32I-NEXT: j .LBB17_110 @@ -8104,8 +8058,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: srli t1, ra, 24 ; RV32I-NEXT: srli a5, a3, 16 ; RV32I-NEXT: srli t4, a3, 24 -; RV32I-NEXT: srli t0, a7, 16 -; RV32I-NEXT: srli s0, a7, 24 +; RV32I-NEXT: srli a7, t0, 16 +; RV32I-NEXT: srli s0, t0, 24 ; RV32I-NEXT: srli t3, a6, 16 ; RV32I-NEXT: srli s3, a6, 24 ; RV32I-NEXT: srli t6, s2, 16 @@ -8124,7 +8078,7 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sb s10, 1(a2) ; RV32I-NEXT: sb a4, 2(a2) ; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: and a4, a7, t2 +; RV32I-NEXT: and a4, t0, t2 ; RV32I-NEXT: srli t1, s11, 8 ; RV32I-NEXT: sb a3, 4(a2) ; RV32I-NEXT: sb t1, 5(a2) @@ -8132,9 +8086,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sb t4, 7(a2) ; RV32I-NEXT: and a3, a6, t2 ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb t0, 8(a2) ; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: sb t0, 10(a2) +; RV32I-NEXT: sb a7, 10(a2) ; RV32I-NEXT: sb s0, 11(a2) ; RV32I-NEXT: and a4, s2, t2 ; RV32I-NEXT: srli a3, a3, 8 @@ -8227,88 +8181,88 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 ; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli s3, s3, 8 ; RV64I-NEXT: or a4, t4, t3 ; RV64I-NEXT: or a6, t6, t5 -; RV64I-NEXT: or t0, s1, s0 -; RV64I-NEXT: lbu t5, 24(a0) -; RV64I-NEXT: lbu t6, 25(a0) -; RV64I-NEXT: lbu s0, 26(a0) -; RV64I-NEXT: lbu s1, 27(a0) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: or t4, s3, s2 -; RV64I-NEXT: or t2, s5, s4 -; RV64I-NEXT: or t3, s7, s6 -; RV64I-NEXT: lbu s2, 28(a0) -; RV64I-NEXT: lbu s3, 29(a0) -; RV64I-NEXT: lbu s4, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli 
t6, t6, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 ; RV64I-NEXT: lbu t6, 0(a1) -; RV64I-NEXT: lbu s1, 1(a1) -; RV64I-NEXT: lbu s7, 2(a1) -; RV64I-NEXT: lbu s8, 3(a1) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, a0, s4 -; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or s5, a0, s5 +; RV64I-NEXT: or t6, s3, t6 +; RV64I-NEXT: or s3, s7, s4 ; RV64I-NEXT: lbu a0, 4(a1) -; RV64I-NEXT: lbu s1, 5(a1) -; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s7, s8, s7 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, a0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or s4, a1, s4 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a7 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: or a0, t4, t0 -; RV64I-NEXT: slli t3, t3, 16 -; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: or s7, a1, s7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a1, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a0, t2, t1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s0, s1, s0 ; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: or t1, s6, s5 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: or t4, s0, t5 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: or t4, s5, t5 ; RV64I-NEXT: slli s3, s3, 16 -; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: or t5, s3, t6 ; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: or t6, s7, t6 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: or t6, s7, s4 ; RV64I-NEXT: li t0, 64 ; RV64I-NEXT: slli t3, a5, 16 ; RV64I-NEXT: slli t2, a6, 16 -; RV64I-NEXT: slli t1, t1, 32 -; RV64I-NEXT: slli t5, t5, 32 -; RV64I-NEXT: slli s0, s0, 32 -; RV64I-NEXT: or a7, t1, a7 -; RV64I-NEXT: or a5, t5, t4 -; RV64I-NEXT: or a6, s0, t6 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t4, t4, 32 +; RV64I-NEXT: slli t6, t6, 32 +; RV64I-NEXT: or a7, a7, s0 +; RV64I-NEXT: or a5, t4, t1 +; RV64I-NEXT: or a6, t6, t5 ; RV64I-NEXT: slli a6, a6, 3 ; RV64I-NEXT: subw t1, a6, t0 ; RV64I-NEXT: negw t5, a6 @@ -8522,47 +8476,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t0, 21(a0) ; RV32I-NEXT: lbu t1, 22(a0) ; RV32I-NEXT: lbu t2, 23(a0) -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t4, 25(a0) -; RV32I-NEXT: lbu t5, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 24(a0) +; RV32I-NEXT: lbu t3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: lbu a7, 28(a0) -; RV32I-NEXT: lbu t0, 29(a0) -; RV32I-NEXT: lbu t1, 30(a0) -; RV32I-NEXT: lbu t2, 31(a0) 
-; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: or t4, t6, t5 -; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: lbu t3, 31(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t1, t1, a7 +; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or s0, t5, a7 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t3, t3, a7 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: or t6, a1, t5 ; RV32I-NEXT: li t5, 32 -; RV32I-NEXT: slli a7, a4, 16 -; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli a7, a5, 16 +; RV32I-NEXT: slli a1, t0, 16 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli a5, t2, 16 -; RV32I-NEXT: or t2, t4, t3 -; RV32I-NEXT: or a4, t1, t0 -; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or t2, t4, a4 +; RV32I-NEXT: or a4, a5, t1 +; RV32I-NEXT: or a5, t6, t3 ; RV32I-NEXT: slli a5, a5, 3 ; RV32I-NEXT: srl s0, t2, a5 ; RV32I-NEXT: neg s6, a5 @@ -8628,6 +8582,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB18_18: ; RV32I-NEXT: neg s11, s9 ; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t5, .LBB18_20 ; RV32I-NEXT: # %bb.19: ; RV32I-NEXT: sra s0, a4, s9 @@ -8636,20 +8591,19 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sll a3, a4, s11 ; RV32I-NEXT: or s0, s0, a3 ; RV32I-NEXT: .LBB18_21: -; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s1, 11(a0) ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beqz s9, .LBB18_23 ; RV32I-NEXT: # %bb.22: ; RV32I-NEXT: mv t4, s0 ; RV32I-NEXT: .LBB18_23: +; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: lbu s2, 9(a0) -; RV32I-NEXT: lbu s1, 10(a0) +; RV32I-NEXT: lbu s3, 10(a0) ; RV32I-NEXT: lbu s8, 13(a0) ; RV32I-NEXT: lbu ra, 14(a0) -; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: bltu s9, t5, .LBB18_25 ; RV32I-NEXT: # %bb.24: @@ -8658,12 +8612,12 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB18_25: ; RV32I-NEXT: sra s0, a4, a5 ; RV32I-NEXT: .LBB18_26: -; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: or s1, s1, s3 ; RV32I-NEXT: lbu s5, 8(a0) ; RV32I-NEXT: lbu s3, 12(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s8, 8 -; RV32I-NEXT: or s8, a3, ra +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or ra, a3, ra ; RV32I-NEXT: bgeu a5, t6, .LBB18_28 ; RV32I-NEXT: # %bb.27: ; RV32I-NEXT: or t4, t0, a6 @@ -8673,8 +8627,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t3, 7(a0) ; RV32I-NEXT: or a6, s2, s5 ; RV32I-NEXT: slli s2, s1, 16 -; RV32I-NEXT: or s1, s4, s3 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: or s1, s8, s3 +; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: mv a1, t1 ; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: beqz a5, .LBB18_30 @@ -8682,26 +8636,26 @@ define void 
@ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: mv a1, t4 ; RV32I-NEXT: mv t0, s0 ; RV32I-NEXT: .LBB18_30: -; RV32I-NEXT: slli s5, a3, 8 -; RV32I-NEXT: lbu ra, 1(a0) -; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: slli s8, a3, 8 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu s4, 2(a0) ; RV32I-NEXT: lbu s3, 5(a0) ; RV32I-NEXT: lbu s0, 6(a0) -; RV32I-NEXT: slli s4, t3, 8 +; RV32I-NEXT: slli s5, t3, 8 ; RV32I-NEXT: or t4, s2, a6 -; RV32I-NEXT: or t3, s8, s1 +; RV32I-NEXT: or t3, ra, s1 ; RV32I-NEXT: bltu a5, t6, .LBB18_32 ; RV32I-NEXT: # %bb.31: ; RV32I-NEXT: srai a6, a4, 31 ; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB18_32: -; RV32I-NEXT: slli a6, ra, 8 -; RV32I-NEXT: or a3, s5, a3 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a6, s8, s4 ; RV32I-NEXT: lbu s1, 0(a0) ; RV32I-NEXT: lbu a0, 4(a0) ; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s0, s4, s0 +; RV32I-NEXT: or s0, s5, s0 ; RV32I-NEXT: srl s2, t4, a5 ; RV32I-NEXT: sll ra, t3, s6 ; RV32I-NEXT: bltu a5, t5, .LBB18_34 @@ -8711,8 +8665,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: .LBB18_34: ; RV32I-NEXT: or s4, s2, ra ; RV32I-NEXT: .LBB18_35: -; RV32I-NEXT: or a6, a6, s1 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: or a3, a3, s1 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: or a0, s3, a0 ; RV32I-NEXT: slli s1, s0, 16 ; RV32I-NEXT: mv s5, t4 @@ -8720,7 +8674,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: # %bb.36: ; RV32I-NEXT: mv s5, s4 ; RV32I-NEXT: .LBB18_37: -; RV32I-NEXT: or s0, a3, a6 +; RV32I-NEXT: or s0, a6, a3 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: bltu a5, t5, .LBB18_39 ; RV32I-NEXT: # %bb.38: @@ -9158,88 +9112,88 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 ; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli s3, s3, 8 ; RV64I-NEXT: or a4, t4, t3 ; RV64I-NEXT: or a6, t6, t5 -; RV64I-NEXT: or t0, s1, s0 -; RV64I-NEXT: lbu t5, 24(a0) -; RV64I-NEXT: lbu t6, 25(a0) -; RV64I-NEXT: lbu s0, 26(a0) -; RV64I-NEXT: lbu s1, 27(a0) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: or t4, s3, s2 -; RV64I-NEXT: or t2, s5, s4 -; RV64I-NEXT: or t3, s7, s6 -; RV64I-NEXT: lbu s2, 28(a0) -; RV64I-NEXT: lbu s3, 29(a0) -; RV64I-NEXT: lbu s4, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: or s6, s11, s10 -; 
RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; RV64I-NEXT: or t5, s4, s3 ; RV64I-NEXT: lbu t6, 0(a1) -; RV64I-NEXT: lbu s1, 1(a1) -; RV64I-NEXT: lbu s7, 2(a1) -; RV64I-NEXT: lbu s8, 3(a1) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, a0, s4 -; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or s5, a0, s5 +; RV64I-NEXT: or t6, s3, t6 +; RV64I-NEXT: or s3, s7, s4 ; RV64I-NEXT: lbu a0, 4(a1) -; RV64I-NEXT: lbu s1, 5(a1) -; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s7, s8, s7 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, a0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or s4, a1, s4 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a7 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: or a0, t4, t0 -; RV64I-NEXT: slli t3, t3, 16 -; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: or s7, a1, s7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a1, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a0, t2, t1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s0, s1, s0 ; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: or t1, s6, s5 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: or t4, s0, t5 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: or t4, s5, t5 ; RV64I-NEXT: slli s3, s3, 16 -; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: or t5, s3, t6 ; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: or t6, s7, t6 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: or t6, s7, s4 ; RV64I-NEXT: li t0, 64 ; RV64I-NEXT: slli t3, a5, 16 ; RV64I-NEXT: slli t2, a6, 16 -; RV64I-NEXT: slli t1, t1, 32 -; RV64I-NEXT: slli t5, t5, 32 -; RV64I-NEXT: slli s0, s0, 32 -; RV64I-NEXT: or a7, t1, a7 -; RV64I-NEXT: or a5, t5, t4 -; RV64I-NEXT: or a6, s0, t6 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t4, t4, 32 +; RV64I-NEXT: slli t6, t6, 32 +; RV64I-NEXT: or a7, a7, s0 +; RV64I-NEXT: or a5, t4, t1 +; RV64I-NEXT: or a6, t6, t5 ; RV64I-NEXT: slli a6, a6, 5 ; RV64I-NEXT: subw t1, a6, t0 ; RV64I-NEXT: negw t5, a6 @@ -9453,47 +9407,47 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: lbu t0, 21(a0) ; RV32I-NEXT: lbu t1, 22(a0) ; RV32I-NEXT: lbu t2, 23(a0) -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t4, 25(a0) -; RV32I-NEXT: lbu t5, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 24(a0) +; RV32I-NEXT: lbu t3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: lbu a7, 28(a0) -; RV32I-NEXT: lbu t0, 29(a0) -; RV32I-NEXT: lbu t1, 30(a0) -; RV32I-NEXT: lbu t2, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t3, t4, t3 -; 
RV32I-NEXT: or t4, t6, t5 -; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu t2, 30(a0) +; RV32I-NEXT: lbu t3, 31(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t1, t1, a7 +; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or s0, t5, a7 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t3, t3, a7 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: or t6, a1, t5 ; RV32I-NEXT: li t5, 32 -; RV32I-NEXT: slli a7, a4, 16 -; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli a7, a5, 16 +; RV32I-NEXT: slli a1, t0, 16 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli a5, t2, 16 -; RV32I-NEXT: or t2, t4, t3 -; RV32I-NEXT: or a4, t1, t0 -; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or t2, t4, a4 +; RV32I-NEXT: or a4, a5, t1 +; RV32I-NEXT: or a5, t6, t3 ; RV32I-NEXT: slli a5, a5, 5 ; RV32I-NEXT: srl s0, t2, a5 ; RV32I-NEXT: neg s6, a5 @@ -9559,6 +9513,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: .LBB19_18: ; RV32I-NEXT: neg s11, s9 ; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t5, .LBB19_20 ; RV32I-NEXT: # %bb.19: ; RV32I-NEXT: sra s0, a4, s9 @@ -9567,20 +9522,19 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sll a3, a4, s11 ; RV32I-NEXT: or s0, s0, a3 ; RV32I-NEXT: .LBB19_21: -; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s1, 11(a0) ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beqz s9, .LBB19_23 ; RV32I-NEXT: # %bb.22: ; RV32I-NEXT: mv t4, s0 ; RV32I-NEXT: .LBB19_23: +; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: lbu s2, 9(a0) -; RV32I-NEXT: lbu s1, 10(a0) +; RV32I-NEXT: lbu s3, 10(a0) ; RV32I-NEXT: lbu s8, 13(a0) ; RV32I-NEXT: lbu ra, 14(a0) -; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: bltu s9, t5, .LBB19_25 ; RV32I-NEXT: # %bb.24: @@ -9589,12 +9543,12 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: .LBB19_25: ; RV32I-NEXT: sra s0, a4, a5 ; RV32I-NEXT: .LBB19_26: -; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: or s1, s1, s3 ; RV32I-NEXT: lbu s5, 8(a0) ; RV32I-NEXT: lbu s3, 12(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s8, 8 -; RV32I-NEXT: or s8, a3, ra +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or ra, a3, ra ; RV32I-NEXT: bgeu a5, t6, .LBB19_28 ; RV32I-NEXT: # %bb.27: ; RV32I-NEXT: or t4, t0, a6 @@ -9604,8 +9558,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: lbu t3, 7(a0) ; RV32I-NEXT: or a6, s2, s5 ; RV32I-NEXT: slli s2, s1, 16 -; RV32I-NEXT: or s1, s4, s3 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: or s1, s8, s3 +; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: mv a1, t1 ; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: beqz a5, .LBB19_30 @@ -9613,26 +9567,26 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: mv a1, t4 ; RV32I-NEXT: mv t0, s0 ; 
RV32I-NEXT: .LBB19_30: -; RV32I-NEXT: slli s5, a3, 8 -; RV32I-NEXT: lbu ra, 1(a0) -; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: slli s8, a3, 8 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu s4, 2(a0) ; RV32I-NEXT: lbu s3, 5(a0) ; RV32I-NEXT: lbu s0, 6(a0) -; RV32I-NEXT: slli s4, t3, 8 +; RV32I-NEXT: slli s5, t3, 8 ; RV32I-NEXT: or t4, s2, a6 -; RV32I-NEXT: or t3, s8, s1 +; RV32I-NEXT: or t3, ra, s1 ; RV32I-NEXT: bltu a5, t6, .LBB19_32 ; RV32I-NEXT: # %bb.31: ; RV32I-NEXT: srai a6, a4, 31 ; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB19_32: -; RV32I-NEXT: slli a6, ra, 8 -; RV32I-NEXT: or a3, s5, a3 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a6, s8, s4 ; RV32I-NEXT: lbu s1, 0(a0) ; RV32I-NEXT: lbu a0, 4(a0) ; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s0, s4, s0 +; RV32I-NEXT: or s0, s5, s0 ; RV32I-NEXT: srl s2, t4, a5 ; RV32I-NEXT: sll ra, t3, s6 ; RV32I-NEXT: bltu a5, t5, .LBB19_34 @@ -9642,8 +9596,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: .LBB19_34: ; RV32I-NEXT: or s4, s2, ra ; RV32I-NEXT: .LBB19_35: -; RV32I-NEXT: or a6, a6, s1 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: or a3, a3, s1 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: or a0, s3, a0 ; RV32I-NEXT: slli s1, s0, 16 ; RV32I-NEXT: mv s5, t4 @@ -9651,7 +9605,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: # %bb.36: ; RV32I-NEXT: mv s5, s4 ; RV32I-NEXT: .LBB19_37: -; RV32I-NEXT: or s0, a3, a6 +; RV32I-NEXT: or s0, a6, a3 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: bltu a5, t5, .LBB19_39 ; RV32I-NEXT: # %bb.38: @@ -10089,88 +10043,88 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: slli t2, t2, 8 ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 ; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: slli s3, s3, 8 ; RV64I-NEXT: or a4, t4, t3 ; RV64I-NEXT: or a6, t6, t5 -; RV64I-NEXT: or t0, s1, s0 -; RV64I-NEXT: lbu t5, 24(a0) -; RV64I-NEXT: lbu t6, 25(a0) -; RV64I-NEXT: lbu s0, 26(a0) -; RV64I-NEXT: lbu s1, 27(a0) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t3, 24(a0) +; RV64I-NEXT: lbu t4, 25(a0) +; RV64I-NEXT: lbu t5, 26(a0) +; RV64I-NEXT: lbu t6, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s7, s7, 8 -; RV64I-NEXT: or t4, s3, s2 -; RV64I-NEXT: or t2, s5, s4 -; RV64I-NEXT: or t3, s7, s6 -; RV64I-NEXT: lbu s2, 28(a0) -; RV64I-NEXT: lbu s3, 29(a0) -; RV64I-NEXT: lbu s4, 30(a0) -; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or s0, s5, s4 +; RV64I-NEXT: or s1, s7, s6 +; RV64I-NEXT: or s2, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s4, 29(a0) +; RV64I-NEXT: lbu s5, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli s11, s11, 8 +; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s5, s9, s8 +; RV64I-NEXT: slli s4, s4, 8 ; RV64I-NEXT: or s6, s11, s10 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: or t3, t4, t3 +; RV64I-NEXT: or t4, t6, t5 +; 
RV64I-NEXT: or t5, s4, s3 ; RV64I-NEXT: lbu t6, 0(a1) -; RV64I-NEXT: lbu s1, 1(a1) -; RV64I-NEXT: lbu s7, 2(a1) -; RV64I-NEXT: lbu s8, 3(a1) -; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, a0, s4 -; RV64I-NEXT: or t6, s1, t6 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s7, s7, 8 +; RV64I-NEXT: or s5, a0, s5 +; RV64I-NEXT: or t6, s3, t6 +; RV64I-NEXT: or s3, s7, s4 ; RV64I-NEXT: lbu a0, 4(a1) -; RV64I-NEXT: lbu s1, 5(a1) -; RV64I-NEXT: lbu s4, 6(a1) +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s7, s8, s7 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or s1, s1, a0 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, a0 ; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or s4, a1, s4 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a7 -; RV64I-NEXT: slli t4, t4, 16 -; RV64I-NEXT: or a0, t4, t0 -; RV64I-NEXT: slli t3, t3, 16 -; RV64I-NEXT: or a7, t3, t2 +; RV64I-NEXT: or s7, a1, s7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a1, t0, a7 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: or a0, t2, t1 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: or s0, s1, s0 ; RV64I-NEXT: slli s6, s6, 16 -; RV64I-NEXT: or t1, s6, s5 -; RV64I-NEXT: slli s0, s0, 16 -; RV64I-NEXT: or t4, s0, t5 +; RV64I-NEXT: or a7, s6, s2 +; RV64I-NEXT: slli t4, t4, 16 +; RV64I-NEXT: or t1, t4, t3 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: or t4, s5, t5 ; RV64I-NEXT: slli s3, s3, 16 -; RV64I-NEXT: or t5, s3, s2 +; RV64I-NEXT: or t5, s3, t6 ; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: or t6, s7, t6 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: or s0, s4, s1 +; RV64I-NEXT: or t6, s7, s4 ; RV64I-NEXT: li t0, 64 ; RV64I-NEXT: slli t3, a5, 16 ; RV64I-NEXT: slli t2, a6, 16 -; RV64I-NEXT: slli t1, t1, 32 -; RV64I-NEXT: slli t5, t5, 32 -; RV64I-NEXT: slli s0, s0, 32 -; RV64I-NEXT: or a7, t1, a7 -; RV64I-NEXT: or a5, t5, t4 -; RV64I-NEXT: or a6, s0, t6 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli t4, t4, 32 +; RV64I-NEXT: slli t6, t6, 32 +; RV64I-NEXT: or a7, a7, s0 +; RV64I-NEXT: or a5, t4, t1 +; RV64I-NEXT: or a6, t6, t5 ; RV64I-NEXT: slli a6, a6, 6 ; RV64I-NEXT: subw t1, a6, t0 ; RV64I-NEXT: negw t5, a6 @@ -10384,47 +10338,47 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: lbu t0, 21(a0) ; RV32I-NEXT: lbu t1, 22(a0) ; RV32I-NEXT: lbu t2, 23(a0) -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu t4, 25(a0) -; RV32I-NEXT: lbu t5, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 24(a0) +; RV32I-NEXT: lbu t3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: lbu a7, 28(a0) -; RV32I-NEXT: lbu t0, 29(a0) -; RV32I-NEXT: lbu t1, 30(a0) -; RV32I-NEXT: lbu t2, 31(a0) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: or t4, t6, t5 -; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu t1, 29(a0) +; RV32I-NEXT: lbu t2, 30(a0) +; 
RV32I-NEXT: lbu t3, 31(a0) +; RV32I-NEXT: slli t5, t5, 8 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t1, t1, a7 +; RV32I-NEXT: or t2, t3, t2 ; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) -; RV32I-NEXT: lbu t6, 2(a1) +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or s0, t5, a7 +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t3, t3, a7 ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or t2, a1, t6 +; RV32I-NEXT: or t6, a1, t5 ; RV32I-NEXT: li t5, 32 -; RV32I-NEXT: slli a7, a4, 16 -; RV32I-NEXT: slli a1, a5, 16 +; RV32I-NEXT: slli a7, a5, 16 +; RV32I-NEXT: slli a1, t0, 16 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli a5, t2, 16 -; RV32I-NEXT: or t2, t4, t3 -; RV32I-NEXT: or a4, t1, t0 -; RV32I-NEXT: or a5, a5, s0 +; RV32I-NEXT: slli t6, t6, 16 +; RV32I-NEXT: or t2, t4, a4 +; RV32I-NEXT: or a4, a5, t1 +; RV32I-NEXT: or a5, t6, t3 ; RV32I-NEXT: slli a5, a5, 6 ; RV32I-NEXT: srl s0, t2, a5 ; RV32I-NEXT: neg s6, a5 @@ -10490,6 +10444,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: .LBB20_18: ; RV32I-NEXT: neg s11, s9 ; RV32I-NEXT: sw s0, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: bltu s9, t5, .LBB20_20 ; RV32I-NEXT: # %bb.19: ; RV32I-NEXT: sra s0, a4, s9 @@ -10498,20 +10453,19 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sll a3, a4, s11 ; RV32I-NEXT: or s0, s0, a3 ; RV32I-NEXT: .LBB20_21: -; RV32I-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t4, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu s3, 11(a0) +; RV32I-NEXT: lbu s1, 11(a0) ; RV32I-NEXT: lbu a3, 15(a0) ; RV32I-NEXT: mv t4, t2 ; RV32I-NEXT: beqz s9, .LBB20_23 ; RV32I-NEXT: # %bb.22: ; RV32I-NEXT: mv t4, s0 ; RV32I-NEXT: .LBB20_23: +; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: lbu s2, 9(a0) -; RV32I-NEXT: lbu s1, 10(a0) +; RV32I-NEXT: lbu s3, 10(a0) ; RV32I-NEXT: lbu s8, 13(a0) ; RV32I-NEXT: lbu ra, 14(a0) -; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: bltu s9, t5, .LBB20_25 ; RV32I-NEXT: # %bb.24: @@ -10520,12 +10474,12 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: .LBB20_25: ; RV32I-NEXT: sra s0, a4, a5 ; RV32I-NEXT: .LBB20_26: -; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: or s1, s1, s3 ; RV32I-NEXT: lbu s5, 8(a0) ; RV32I-NEXT: lbu s3, 12(a0) -; RV32I-NEXT: slli s2, s2, 8 -; RV32I-NEXT: slli s4, s8, 8 -; RV32I-NEXT: or s8, a3, ra +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: or ra, a3, ra ; RV32I-NEXT: bgeu a5, t6, .LBB20_28 ; RV32I-NEXT: # %bb.27: ; RV32I-NEXT: or t4, t0, a6 @@ -10535,8 +10489,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: lbu t3, 7(a0) ; RV32I-NEXT: or a6, s2, s5 ; RV32I-NEXT: slli s2, s1, 16 -; RV32I-NEXT: or s1, s4, s3 -; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: or s1, s8, s3 +; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: mv a1, t1 ; RV32I-NEXT: mv t0, a7 ; RV32I-NEXT: beqz a5, .LBB20_30 @@ -10544,26 +10498,26 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: mv a1, t4 ; RV32I-NEXT: mv t0, s0 ; RV32I-NEXT: .LBB20_30: -; RV32I-NEXT: slli s5, a3, 8 -; RV32I-NEXT: lbu ra, 1(a0) -; RV32I-NEXT: lbu a3, 2(a0) +; 
RV32I-NEXT: slli s8, a3, 8 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu s4, 2(a0) ; RV32I-NEXT: lbu s3, 5(a0) ; RV32I-NEXT: lbu s0, 6(a0) -; RV32I-NEXT: slli s4, t3, 8 +; RV32I-NEXT: slli s5, t3, 8 ; RV32I-NEXT: or t4, s2, a6 -; RV32I-NEXT: or t3, s8, s1 +; RV32I-NEXT: or t3, ra, s1 ; RV32I-NEXT: bltu a5, t6, .LBB20_32 ; RV32I-NEXT: # %bb.31: ; RV32I-NEXT: srai a6, a4, 31 ; RV32I-NEXT: sw a6, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw a6, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: .LBB20_32: -; RV32I-NEXT: slli a6, ra, 8 -; RV32I-NEXT: or a3, s5, a3 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a6, s8, s4 ; RV32I-NEXT: lbu s1, 0(a0) ; RV32I-NEXT: lbu a0, 4(a0) ; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s0, s4, s0 +; RV32I-NEXT: or s0, s5, s0 ; RV32I-NEXT: srl s2, t4, a5 ; RV32I-NEXT: sll ra, t3, s6 ; RV32I-NEXT: bltu a5, t5, .LBB20_34 @@ -10573,8 +10527,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: .LBB20_34: ; RV32I-NEXT: or s4, s2, ra ; RV32I-NEXT: .LBB20_35: -; RV32I-NEXT: or a6, a6, s1 -; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: or a3, a3, s1 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: or a0, s3, a0 ; RV32I-NEXT: slli s1, s0, 16 ; RV32I-NEXT: mv s5, t4 @@ -10582,7 +10536,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: # %bb.36: ; RV32I-NEXT: mv s5, s4 ; RV32I-NEXT: .LBB20_37: -; RV32I-NEXT: or s0, a3, a6 +; RV32I-NEXT: or s0, a6, a3 ; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: bltu a5, t5, .LBB20_39 ; RV32I-NEXT: # %bb.38: diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index f60def9d546f8..a50c303819f23 100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -39,6 +39,7 @@ ; CHECK-NEXT: RISC-V DAG->DAG Pattern Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation +; CHECK-NEXT: RISC-V VMV0 Elimination ; CHECK-NEXT: RISC-V Pre-RA pseudo instruction expansion pass ; CHECK-NEXT: RISC-V Insert Read/Write CSR Pass ; CHECK-NEXT: RISC-V Insert Write VXRM Pass diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 668c734612447..2646dfeca4eb6 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -97,6 +97,7 @@ ; CHECK-NEXT: RISC-V DAG->DAG Pattern Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: RISC-V Vector Peephole Optimization +; CHECK-NEXT: RISC-V VMV0 Elimination ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll index c9a48acb8d14a..d7290e1e65540 100644 --- a/llvm/test/CodeGen/RISCV/abds-neg.ll +++ b/llvm/test/CodeGen/RISCV/abds-neg.ll @@ -625,42 +625,42 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a5, 4(a2) ; RV32I-NEXT: lw t0, 8(a2) ; RV32I-NEXT: lw t2, 12(a2) -; RV32I-NEXT: lw a1, 0(a2) -; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t3, t0, a6 -; RV32I-NEXT: mv t4, t3 +; RV32I-NEXT: mv t5, t3 ; RV32I-NEXT: beq t1, t2, .LBB11_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t2, t1 +; RV32I-NEXT: slt t5, t2, t1 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sltu a5, a1, a3 -; 
RV32I-NEXT: sltu t6, a2, a4 -; RV32I-NEXT: mv a7, a5 -; RV32I-NEXT: beq a4, a2, .LBB11_4 +; RV32I-NEXT: sltu t4, a5, a4 +; RV32I-NEXT: sltu a2, a1, a3 +; RV32I-NEXT: mv a7, a2 +; RV32I-NEXT: beq a4, a5, .LBB11_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv a7, t6 +; RV32I-NEXT: mv a7, t4 ; RV32I-NEXT: .LBB11_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t5, t1, t2 +; RV32I-NEXT: xor t6, t1, t2 ; RV32I-NEXT: xor s0, a6, t0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: beqz t5, .LBB11_6 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: beqz t6, .LBB11_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv a7, t4 +; RV32I-NEXT: mv a7, t5 ; RV32I-NEXT: .LBB11_6: -; RV32I-NEXT: mv t5, a5 -; RV32I-NEXT: beq a2, a4, .LBB11_8 +; RV32I-NEXT: mv t5, a2 +; RV32I-NEXT: beq a5, a4, .LBB11_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t5, t6 +; RV32I-NEXT: mv t5, t4 ; RV32I-NEXT: .LBB11_8: ; RV32I-NEXT: sltu t4, a3, a1 ; RV32I-NEXT: mv t6, t4 -; RV32I-NEXT: beq a4, a2, .LBB11_10 +; RV32I-NEXT: beq a4, a5, .LBB11_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a2 +; RV32I-NEXT: sltu t6, a4, a5 ; RV32I-NEXT: .LBB11_10: ; RV32I-NEXT: bnez a7, .LBB11_12 ; RV32I-NEXT: # %bb.11: @@ -684,12 +684,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: add t0, t0, t1 ; RV32I-NEXT: bnez a7, .LBB11_15 ; RV32I-NEXT: # %bb.14: -; RV32I-NEXT: sub a2, a2, a4 -; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sub a2, a5, a2 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: j .LBB11_16 ; RV32I-NEXT: .LBB11_15: -; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a4, a4, a5 ; RV32I-NEXT: sub a2, a4, t4 ; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: .LBB11_16: @@ -744,42 +744,42 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a5, 4(a2) ; RV32ZBB-NEXT: lw t0, 8(a2) ; RV32ZBB-NEXT: lw t2, 12(a2) -; RV32ZBB-NEXT: lw a1, 0(a2) -; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t3, t0, a6 -; RV32ZBB-NEXT: mv t4, t3 +; RV32ZBB-NEXT: mv t5, t3 ; RV32ZBB-NEXT: beq t1, t2, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t2, t1 +; RV32ZBB-NEXT: slt t5, t2, t1 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sltu a5, a1, a3 -; RV32ZBB-NEXT: sltu t6, a2, a4 -; RV32ZBB-NEXT: mv a7, a5 -; RV32ZBB-NEXT: beq a4, a2, .LBB11_4 +; RV32ZBB-NEXT: sltu t4, a5, a4 +; RV32ZBB-NEXT: sltu a2, a1, a3 +; RV32ZBB-NEXT: mv a7, a2 +; RV32ZBB-NEXT: beq a4, a5, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv a7, t6 +; RV32ZBB-NEXT: mv a7, t4 ; RV32ZBB-NEXT: .LBB11_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t5, t1, t2 +; RV32ZBB-NEXT: xor t6, t1, t2 ; RV32ZBB-NEXT: xor s0, a6, t0 -; RV32ZBB-NEXT: or t5, s0, t5 -; RV32ZBB-NEXT: beqz t5, .LBB11_6 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB11_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv a7, t4 +; RV32ZBB-NEXT: mv a7, t5 ; RV32ZBB-NEXT: .LBB11_6: -; RV32ZBB-NEXT: mv t5, a5 -; RV32ZBB-NEXT: beq a2, a4, .LBB11_8 +; RV32ZBB-NEXT: mv t5, a2 +; RV32ZBB-NEXT: beq a5, a4, .LBB11_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t5, t6 +; RV32ZBB-NEXT: mv t5, t4 ; RV32ZBB-NEXT: .LBB11_8: ; RV32ZBB-NEXT: sltu t4, a3, a1 ; RV32ZBB-NEXT: mv t6, t4 -; RV32ZBB-NEXT: beq a4, a2, .LBB11_10 +; RV32ZBB-NEXT: beq a4, a5, .LBB11_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a2 +; RV32ZBB-NEXT: sltu t6, a4, a5 ; 
RV32ZBB-NEXT: .LBB11_10: ; RV32ZBB-NEXT: bnez a7, .LBB11_12 ; RV32ZBB-NEXT: # %bb.11: @@ -803,12 +803,12 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: add t0, t0, t1 ; RV32ZBB-NEXT: bnez a7, .LBB11_15 ; RV32ZBB-NEXT: # %bb.14: -; RV32ZBB-NEXT: sub a2, a2, a4 -; RV32ZBB-NEXT: sub a2, a2, a5 +; RV32ZBB-NEXT: sub a5, a5, a4 +; RV32ZBB-NEXT: sub a2, a5, a2 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: j .LBB11_16 ; RV32ZBB-NEXT: .LBB11_15: -; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a4, a4, a5 ; RV32ZBB-NEXT: sub a2, a4, t4 ; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: .LBB11_16: @@ -872,42 +872,42 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: lw a5, 4(a2) ; RV32I-NEXT: lw t0, 8(a2) ; RV32I-NEXT: lw t2, 12(a2) -; RV32I-NEXT: lw a1, 0(a2) -; RV32I-NEXT: lw a2, 4(a2) ; RV32I-NEXT: sltu t3, t0, a6 -; RV32I-NEXT: mv t4, t3 +; RV32I-NEXT: mv t5, t3 ; RV32I-NEXT: beq t1, t2, .LBB12_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t2, t1 +; RV32I-NEXT: slt t5, t2, t1 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sltu a5, a1, a3 -; RV32I-NEXT: sltu t6, a2, a4 -; RV32I-NEXT: mv a7, a5 -; RV32I-NEXT: beq a4, a2, .LBB12_4 +; RV32I-NEXT: sltu t4, a5, a4 +; RV32I-NEXT: sltu a2, a1, a3 +; RV32I-NEXT: mv a7, a2 +; RV32I-NEXT: beq a4, a5, .LBB12_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: mv a7, t6 +; RV32I-NEXT: mv a7, t4 ; RV32I-NEXT: .LBB12_4: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: xor t5, t1, t2 +; RV32I-NEXT: xor t6, t1, t2 ; RV32I-NEXT: xor s0, a6, t0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: beqz t5, .LBB12_6 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: beqz t6, .LBB12_6 ; RV32I-NEXT: # %bb.5: -; RV32I-NEXT: mv a7, t4 +; RV32I-NEXT: mv a7, t5 ; RV32I-NEXT: .LBB12_6: -; RV32I-NEXT: mv t5, a5 -; RV32I-NEXT: beq a2, a4, .LBB12_8 +; RV32I-NEXT: mv t5, a2 +; RV32I-NEXT: beq a5, a4, .LBB12_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv t5, t6 +; RV32I-NEXT: mv t5, t4 ; RV32I-NEXT: .LBB12_8: ; RV32I-NEXT: sltu t4, a3, a1 ; RV32I-NEXT: mv t6, t4 -; RV32I-NEXT: beq a4, a2, .LBB12_10 +; RV32I-NEXT: beq a4, a5, .LBB12_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t6, a4, a2 +; RV32I-NEXT: sltu t6, a4, a5 ; RV32I-NEXT: .LBB12_10: ; RV32I-NEXT: bnez a7, .LBB12_12 ; RV32I-NEXT: # %bb.11: @@ -931,12 +931,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: add t0, t0, t1 ; RV32I-NEXT: bnez a7, .LBB12_15 ; RV32I-NEXT: # %bb.14: -; RV32I-NEXT: sub a2, a2, a4 -; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sub a2, a5, a2 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: j .LBB12_16 ; RV32I-NEXT: .LBB12_15: -; RV32I-NEXT: sub a4, a4, a2 +; RV32I-NEXT: sub a4, a4, a5 ; RV32I-NEXT: sub a2, a4, t4 ; RV32I-NEXT: sub a1, a3, a1 ; RV32I-NEXT: .LBB12_16: @@ -991,42 +991,42 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: lw a1, 0(a2) +; RV32ZBB-NEXT: lw a5, 4(a2) ; RV32ZBB-NEXT: lw t0, 8(a2) ; RV32ZBB-NEXT: lw t2, 12(a2) -; RV32ZBB-NEXT: lw a1, 0(a2) -; RV32ZBB-NEXT: lw a2, 4(a2) ; RV32ZBB-NEXT: sltu t3, t0, a6 -; RV32ZBB-NEXT: mv t4, t3 +; RV32ZBB-NEXT: mv t5, t3 ; RV32ZBB-NEXT: beq t1, t2, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t2, t1 +; RV32ZBB-NEXT: slt t5, t2, t1 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sltu a5, a1, 
a3 -; RV32ZBB-NEXT: sltu t6, a2, a4 -; RV32ZBB-NEXT: mv a7, a5 -; RV32ZBB-NEXT: beq a4, a2, .LBB12_4 +; RV32ZBB-NEXT: sltu t4, a5, a4 +; RV32ZBB-NEXT: sltu a2, a1, a3 +; RV32ZBB-NEXT: mv a7, a2 +; RV32ZBB-NEXT: beq a4, a5, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: mv a7, t6 +; RV32ZBB-NEXT: mv a7, t4 ; RV32ZBB-NEXT: .LBB12_4: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: xor t5, t1, t2 +; RV32ZBB-NEXT: xor t6, t1, t2 ; RV32ZBB-NEXT: xor s0, a6, t0 -; RV32ZBB-NEXT: or t5, s0, t5 -; RV32ZBB-NEXT: beqz t5, .LBB12_6 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB12_6 ; RV32ZBB-NEXT: # %bb.5: -; RV32ZBB-NEXT: mv a7, t4 +; RV32ZBB-NEXT: mv a7, t5 ; RV32ZBB-NEXT: .LBB12_6: -; RV32ZBB-NEXT: mv t5, a5 -; RV32ZBB-NEXT: beq a2, a4, .LBB12_8 +; RV32ZBB-NEXT: mv t5, a2 +; RV32ZBB-NEXT: beq a5, a4, .LBB12_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: mv t5, t6 +; RV32ZBB-NEXT: mv t5, t4 ; RV32ZBB-NEXT: .LBB12_8: ; RV32ZBB-NEXT: sltu t4, a3, a1 ; RV32ZBB-NEXT: mv t6, t4 -; RV32ZBB-NEXT: beq a4, a2, .LBB12_10 +; RV32ZBB-NEXT: beq a4, a5, .LBB12_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t6, a4, a2 +; RV32ZBB-NEXT: sltu t6, a4, a5 ; RV32ZBB-NEXT: .LBB12_10: ; RV32ZBB-NEXT: bnez a7, .LBB12_12 ; RV32ZBB-NEXT: # %bb.11: @@ -1050,12 +1050,12 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: add t0, t0, t1 ; RV32ZBB-NEXT: bnez a7, .LBB12_15 ; RV32ZBB-NEXT: # %bb.14: -; RV32ZBB-NEXT: sub a2, a2, a4 -; RV32ZBB-NEXT: sub a2, a2, a5 +; RV32ZBB-NEXT: sub a5, a5, a4 +; RV32ZBB-NEXT: sub a2, a5, a2 ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: j .LBB12_16 ; RV32ZBB-NEXT: .LBB12_15: -; RV32ZBB-NEXT: sub a4, a4, a2 +; RV32ZBB-NEXT: sub a4, a4, a5 ; RV32ZBB-NEXT: sub a2, a4, t4 ; RV32ZBB-NEXT: sub a1, a3, a1 ; RV32ZBB-NEXT: .LBB12_16: @@ -1382,30 +1382,30 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_minmax_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a6, 4(a2) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 4(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t0, 12(a2) -; RV32I-NEXT: lw a5, 12(a1) ; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: beq a5, t0, .LBB17_2 +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: beq a7, t0, .LBB17_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t1, a5, t0 +; RV32I-NEXT: slt t1, a7, t0 ; RV32I-NEXT: j .LBB17_3 ; RV32I-NEXT: .LBB17_2: -; RV32I-NEXT: sltu t1, a4, a7 +; RV32I-NEXT: sltu t1, a4, a6 ; RV32I-NEXT: .LBB17_3: ; RV32I-NEXT: lw t2, 0(a2) ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: beq a3, a6, .LBB17_5 +; RV32I-NEXT: beq a3, a5, .LBB17_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sltu t6, a3, a6 +; RV32I-NEXT: sltu t6, a3, a5 ; RV32I-NEXT: j .LBB17_6 ; RV32I-NEXT: .LBB17_5: ; RV32I-NEXT: sltu t6, a1, t2 ; RV32I-NEXT: .LBB17_6: -; RV32I-NEXT: xor a2, a5, t0 -; RV32I-NEXT: xor t3, a4, a7 +; RV32I-NEXT: xor a2, a7, t0 +; RV32I-NEXT: xor t3, a4, a6 ; RV32I-NEXT: or t5, t3, a2 ; RV32I-NEXT: beqz t5, .LBB17_8 ; RV32I-NEXT: # %bb.7: @@ -1413,27 +1413,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB17_8: ; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: mv t1, a3 -; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t4, a7 ; RV32I-NEXT: mv t3, a4 ; RV32I-NEXT: bnez t6, .LBB17_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a2, t2 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: mv t1, a5 ; RV32I-NEXT: mv t4, t0 -; RV32I-NEXT: mv t3, a7 +; RV32I-NEXT: mv t3, a6 ; RV32I-NEXT: .LBB17_10: -; 
RV32I-NEXT: beq a5, t0, .LBB17_12 +; RV32I-NEXT: beq a7, t0, .LBB17_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: slt t6, t0, a5 +; RV32I-NEXT: slt t6, t0, a7 ; RV32I-NEXT: j .LBB17_13 ; RV32I-NEXT: .LBB17_12: -; RV32I-NEXT: sltu t6, a7, a4 +; RV32I-NEXT: sltu t6, a6, a4 ; RV32I-NEXT: .LBB17_13: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: beq a3, a6, .LBB17_15 +; RV32I-NEXT: beq a3, a5, .LBB17_15 ; RV32I-NEXT: # %bb.14: -; RV32I-NEXT: sltu s0, a6, a3 +; RV32I-NEXT: sltu s0, a5, a3 ; RV32I-NEXT: bnez t5, .LBB17_16 ; RV32I-NEXT: j .LBB17_17 ; RV32I-NEXT: .LBB17_15: @@ -1445,14 +1445,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: bnez s0, .LBB17_19 ; RV32I-NEXT: # %bb.18: ; RV32I-NEXT: mv a1, t2 -; RV32I-NEXT: mv a3, a6 -; RV32I-NEXT: mv a5, t0 -; RV32I-NEXT: mv a4, a7 +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a7, t0 +; RV32I-NEXT: mv a4, a6 ; RV32I-NEXT: .LBB17_19: -; RV32I-NEXT: sltu a7, t3, a4 -; RV32I-NEXT: sub a5, t4, a5 +; RV32I-NEXT: sltu a5, t3, a4 +; RV32I-NEXT: sub a6, t4, a7 +; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sltu a6, a2, a1 -; RV32I-NEXT: sub a5, a5, a7 ; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beq t1, a3, .LBB17_21 ; RV32I-NEXT: # %bb.20: @@ -1509,30 +1509,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_minmax_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a6, 4(a2) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 4(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a2) -; RV32ZBB-NEXT: lw a5, 12(a1) ; RV32ZBB-NEXT: lw a3, 4(a1) ; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: beq a5, t0, .LBB17_2 +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: beq a7, t0, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t1, a5, t0 +; RV32ZBB-NEXT: slt t1, a7, t0 ; RV32ZBB-NEXT: j .LBB17_3 ; RV32ZBB-NEXT: .LBB17_2: -; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: sltu t1, a4, a6 ; RV32ZBB-NEXT: .LBB17_3: ; RV32ZBB-NEXT: lw t2, 0(a2) ; RV32ZBB-NEXT: lw a1, 0(a1) -; RV32ZBB-NEXT: beq a3, a6, .LBB17_5 +; RV32ZBB-NEXT: beq a3, a5, .LBB17_5 ; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: sltu t6, a3, a6 +; RV32ZBB-NEXT: sltu t6, a3, a5 ; RV32ZBB-NEXT: j .LBB17_6 ; RV32ZBB-NEXT: .LBB17_5: ; RV32ZBB-NEXT: sltu t6, a1, t2 ; RV32ZBB-NEXT: .LBB17_6: -; RV32ZBB-NEXT: xor a2, a5, t0 -; RV32ZBB-NEXT: xor t3, a4, a7 +; RV32ZBB-NEXT: xor a2, a7, t0 +; RV32ZBB-NEXT: xor t3, a4, a6 ; RV32ZBB-NEXT: or t5, t3, a2 ; RV32ZBB-NEXT: beqz t5, .LBB17_8 ; RV32ZBB-NEXT: # %bb.7: @@ -1540,27 +1540,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB17_8: ; RV32ZBB-NEXT: mv a2, a1 ; RV32ZBB-NEXT: mv t1, a3 -; RV32ZBB-NEXT: mv t4, a5 +; RV32ZBB-NEXT: mv t4, a7 ; RV32ZBB-NEXT: mv t3, a4 ; RV32ZBB-NEXT: bnez t6, .LBB17_10 ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: mv a2, t2 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: mv t1, a5 ; RV32ZBB-NEXT: mv t4, t0 -; RV32ZBB-NEXT: mv t3, a7 +; RV32ZBB-NEXT: mv t3, a6 ; RV32ZBB-NEXT: .LBB17_10: -; RV32ZBB-NEXT: beq a5, t0, .LBB17_12 +; RV32ZBB-NEXT: beq a7, t0, .LBB17_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: slt t6, t0, a5 +; RV32ZBB-NEXT: slt t6, t0, a7 ; RV32ZBB-NEXT: j .LBB17_13 ; RV32ZBB-NEXT: .LBB17_12: -; RV32ZBB-NEXT: sltu t6, a7, a4 +; RV32ZBB-NEXT: sltu t6, a6, a4 ; RV32ZBB-NEXT: .LBB17_13: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: beq a3, a6, .LBB17_15 +; RV32ZBB-NEXT: beq a3, a5, .LBB17_15 ; RV32ZBB-NEXT: # %bb.14: -; RV32ZBB-NEXT: sltu s0, a6, a3 +; 
RV32ZBB-NEXT: sltu s0, a5, a3 ; RV32ZBB-NEXT: bnez t5, .LBB17_16 ; RV32ZBB-NEXT: j .LBB17_17 ; RV32ZBB-NEXT: .LBB17_15: @@ -1572,14 +1572,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: bnez s0, .LBB17_19 ; RV32ZBB-NEXT: # %bb.18: ; RV32ZBB-NEXT: mv a1, t2 -; RV32ZBB-NEXT: mv a3, a6 -; RV32ZBB-NEXT: mv a5, t0 -; RV32ZBB-NEXT: mv a4, a7 +; RV32ZBB-NEXT: mv a3, a5 +; RV32ZBB-NEXT: mv a7, t0 +; RV32ZBB-NEXT: mv a4, a6 ; RV32ZBB-NEXT: .LBB17_19: -; RV32ZBB-NEXT: sltu a7, t3, a4 -; RV32ZBB-NEXT: sub a5, t4, a5 +; RV32ZBB-NEXT: sltu a5, t3, a4 +; RV32ZBB-NEXT: sub a6, t4, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sltu a6, a2, a1 -; RV32ZBB-NEXT: sub a5, a5, a7 ; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beq t1, a3, .LBB17_21 ; RV32ZBB-NEXT: # %bb.20: @@ -1862,26 +1862,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a7, 12(a2) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a6, a5 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a7, .LBB22_2 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: sltu a1, a7, a6 +; RV32I-NEXT: mv t4, a1 +; RV32I-NEXT: beq t1, t0, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t4, t0, a7 +; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB22_2: ; RV32I-NEXT: sltu t2, a2, a3 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a4, .LBB22_4 +; RV32I-NEXT: beq a5, a4, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a4 +; RV32I-NEXT: sltu t3, a5, a4 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: xor t5, t0, a7 -; RV32I-NEXT: xor t6, a6, a5 +; RV32I-NEXT: xor t5, t1, t0 +; RV32I-NEXT: xor t6, a7, a6 ; RV32I-NEXT: or t5, t6, t5 ; RV32I-NEXT: mv t6, t3 ; RV32I-NEXT: beqz t5, .LBB22_6 @@ -1890,32 +1890,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB22_6: ; RV32I-NEXT: sltu t4, a3, a2 ; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a4, .LBB22_8 +; RV32I-NEXT: beq a5, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a4, a1 +; RV32I-NEXT: sltu t5, a4, a5 ; RV32I-NEXT: .LBB22_8: ; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t1, a5, a6 -; RV32I-NEXT: sub a7, a7, t0 -; RV32I-NEXT: sub a5, a5, a6 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a6, a7, t1 -; RV32I-NEXT: sltu a7, a5, t5 -; RV32I-NEXT: sub a1, a5, t5 +; RV32I-NEXT: sltu a1, a6, a7 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: sub a6, a6, a7 +; RV32I-NEXT: sub a4, a4, a5 +; RV32I-NEXT: sub a7, t0, a1 +; RV32I-NEXT: sltu t0, a6, t5 +; RV32I-NEXT: sub a1, a6, t5 ; RV32I-NEXT: sub a5, a4, t4 -; RV32I-NEXT: sub a4, a6, a7 +; RV32I-NEXT: sub a4, a7, t0 ; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sub a7, t0, a7 -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: sub a4, a1, a4 -; RV32I-NEXT: sub a6, a7, t1 -; RV32I-NEXT: sltu a7, a5, t3 -; RV32I-NEXT: sub a1, a5, t3 -; RV32I-NEXT: sub a5, a4, t2 -; RV32I-NEXT: sub a4, a6, a7 +; RV32I-NEXT: sub t0, t1, t0 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sub a4, t0, a1 +; RV32I-NEXT: sltu a7, a6, t3 +; RV32I-NEXT: sub a1, a6, t3 +; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a4, a4, a7 ; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: .LBB22_11: ; RV32I-NEXT: sw a2, 0(a0) @@ -1949,26 
+1949,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a7, 12(a2) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw t0, 12(a2) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a6, a5 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a7, .LBB22_2 +; RV32ZBB-NEXT: lw a5, 4(a1) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: sltu a1, a7, a6 +; RV32ZBB-NEXT: mv t4, a1 +; RV32ZBB-NEXT: beq t1, t0, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: slt t4, t0, a7 +; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB22_2: ; RV32ZBB-NEXT: sltu t2, a2, a3 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_4 +; RV32ZBB-NEXT: beq a5, a4, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a4 +; RV32ZBB-NEXT: sltu t3, a5, a4 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: xor t5, t0, a7 -; RV32ZBB-NEXT: xor t6, a6, a5 +; RV32ZBB-NEXT: xor t5, t1, t0 +; RV32ZBB-NEXT: xor t6, a7, a6 ; RV32ZBB-NEXT: or t5, t6, t5 ; RV32ZBB-NEXT: mv t6, t3 ; RV32ZBB-NEXT: beqz t5, .LBB22_6 @@ -1977,32 +1977,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB22_6: ; RV32ZBB-NEXT: sltu t4, a3, a2 ; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 +; RV32ZBB-NEXT: beq a5, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a4, a1 +; RV32ZBB-NEXT: sltu t5, a4, a5 ; RV32ZBB-NEXT: .LBB22_8: ; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t1, a5, a6 -; RV32ZBB-NEXT: sub a7, a7, t0 -; RV32ZBB-NEXT: sub a5, a5, a6 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a6, a7, t1 -; RV32ZBB-NEXT: sltu a7, a5, t5 -; RV32ZBB-NEXT: sub a1, a5, t5 +; RV32ZBB-NEXT: sltu a1, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sub a4, a4, a5 +; RV32ZBB-NEXT: sub a7, t0, a1 +; RV32ZBB-NEXT: sltu t0, a6, t5 +; RV32ZBB-NEXT: sub a1, a6, t5 ; RV32ZBB-NEXT: sub a5, a4, t4 -; RV32ZBB-NEXT: sub a4, a6, a7 +; RV32ZBB-NEXT: sub a4, a7, t0 ; RV32ZBB-NEXT: sub a2, a3, a2 ; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sub a7, t0, a7 -; RV32ZBB-NEXT: sub a5, a6, a5 -; RV32ZBB-NEXT: sub a4, a1, a4 -; RV32ZBB-NEXT: sub a6, a7, t1 -; RV32ZBB-NEXT: sltu a7, a5, t3 -; RV32ZBB-NEXT: sub a1, a5, t3 -; RV32ZBB-NEXT: sub a5, a4, t2 -; RV32ZBB-NEXT: sub a4, a6, a7 +; RV32ZBB-NEXT: sub t0, t1, t0 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a5, a5, a4 +; RV32ZBB-NEXT: sub a4, t0, a1 +; RV32ZBB-NEXT: sltu a7, a6, t3 +; RV32ZBB-NEXT: sub a1, a6, t3 +; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a4, a4, a7 ; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: .LBB22_11: ; RV32ZBB-NEXT: sw a2, 0(a0) @@ -2391,53 +2391,53 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a6, 12(a2) -; RV32I-NEXT: lw t0, 8(a1) -; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 12(a2) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a7, 4(a1) -; RV32I-NEXT: sltu a1, t0, a5 -; RV32I-NEXT: sub t1, t1, a6 -; RV32I-NEXT: sltu a6, a2, a3 -; RV32I-NEXT: sub a1, t1, a1 -; RV32I-NEXT: mv t1, a6 -; RV32I-NEXT: beq a7, a4, .LBB31_2 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw t1, 8(a1) +; RV32I-NEXT: lw a1, 
12(a1) +; RV32I-NEXT: sltu t0, t1, a6 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: sltu a7, a2, a3 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: beq a5, a4, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a7, a4 +; RV32I-NEXT: sltu t0, a5, a4 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: sub a5, t0, a5 -; RV32I-NEXT: sub a4, a7, a4 -; RV32I-NEXT: sub a3, a2, a3 -; RV32I-NEXT: sltu a2, a5, t1 -; RV32I-NEXT: sub t0, a4, a6 -; RV32I-NEXT: sub a4, a5, t1 -; RV32I-NEXT: sub a5, a1, a2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: sltu a3, a6, t0 +; RV32I-NEXT: sub t1, a5, a7 +; RV32I-NEXT: sub a4, a6, t0 +; RV32I-NEXT: sub a5, a1, a3 ; RV32I-NEXT: srai a1, a5, 31 -; RV32I-NEXT: xor a2, a4, a1 -; RV32I-NEXT: xor a5, a5, a1 -; RV32I-NEXT: xor a4, a3, a1 -; RV32I-NEXT: sltu a3, a1, a2 -; RV32I-NEXT: sub a6, a1, a5 -; RV32I-NEXT: sltu a5, a1, a4 -; RV32I-NEXT: sub a3, a6, a3 -; RV32I-NEXT: xor a7, t0, a1 -; RV32I-NEXT: mv a6, a5 -; RV32I-NEXT: beqz t0, .LBB31_4 +; RV32I-NEXT: xor a3, a4, a1 +; RV32I-NEXT: xor a6, a5, a1 +; RV32I-NEXT: xor a5, t1, a1 +; RV32I-NEXT: xor a4, a2, a1 +; RV32I-NEXT: sltu a2, a1, a3 +; RV32I-NEXT: sub a6, a1, a6 +; RV32I-NEXT: sub a2, a6, a2 +; RV32I-NEXT: sltu a6, a1, a4 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: beqz t1, .LBB31_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu a6, a1, a7 +; RV32I-NEXT: sltu a7, a1, a5 ; RV32I-NEXT: .LBB31_4: -; RV32I-NEXT: sub a2, a1, a2 -; RV32I-NEXT: sub a7, a1, a7 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sub a5, a1, a5 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sltu a4, a2, a6 -; RV32I-NEXT: sub a2, a2, a6 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a4, a3, a7 +; RV32I-NEXT: sub a3, a3, a7 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128: @@ -2459,53 +2459,53 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a6, 12(a2) -; RV32ZBB-NEXT: lw t0, 8(a1) -; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 12(a2) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a7, 4(a1) -; RV32ZBB-NEXT: sltu a1, t0, a5 -; RV32ZBB-NEXT: sub t1, t1, a6 -; RV32ZBB-NEXT: sltu a6, a2, a3 -; RV32ZBB-NEXT: sub a1, t1, a1 -; RV32ZBB-NEXT: mv t1, a6 -; RV32ZBB-NEXT: beq a7, a4, .LBB31_2 +; RV32ZBB-NEXT: lw a5, 4(a1) +; RV32ZBB-NEXT: lw t1, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: sltu t0, t1, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: sub a1, a1, t0 +; RV32ZBB-NEXT: sltu a7, a2, a3 +; RV32ZBB-NEXT: mv t0, a7 +; RV32ZBB-NEXT: beq a5, a4, .LBB31_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a7, a4 +; RV32ZBB-NEXT: sltu t0, a5, a4 ; RV32ZBB-NEXT: .LBB31_2: -; RV32ZBB-NEXT: sub a5, t0, a5 -; RV32ZBB-NEXT: sub a4, a7, a4 -; RV32ZBB-NEXT: sub a3, a2, a3 -; RV32ZBB-NEXT: sltu a2, a5, t1 -; RV32ZBB-NEXT: sub t0, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t1 -; RV32ZBB-NEXT: sub a5, a1, a2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a5, a5, a4 +; RV32ZBB-NEXT: sub a2, a2, a3 +; RV32ZBB-NEXT: sltu a3, a6, t0 +; RV32ZBB-NEXT: sub t1, a5, a7 +; RV32ZBB-NEXT: sub a4, a6, t0 +; RV32ZBB-NEXT: sub a5, a1, a3 ; RV32ZBB-NEXT: srai a1, a5, 31 
-; RV32ZBB-NEXT: xor a2, a4, a1 -; RV32ZBB-NEXT: xor a5, a5, a1 -; RV32ZBB-NEXT: xor a4, a3, a1 -; RV32ZBB-NEXT: sltu a3, a1, a2 -; RV32ZBB-NEXT: sub a6, a1, a5 -; RV32ZBB-NEXT: sltu a5, a1, a4 -; RV32ZBB-NEXT: sub a3, a6, a3 -; RV32ZBB-NEXT: xor a7, t0, a1 -; RV32ZBB-NEXT: mv a6, a5 -; RV32ZBB-NEXT: beqz t0, .LBB31_4 +; RV32ZBB-NEXT: xor a3, a4, a1 +; RV32ZBB-NEXT: xor a6, a5, a1 +; RV32ZBB-NEXT: xor a5, t1, a1 +; RV32ZBB-NEXT: xor a4, a2, a1 +; RV32ZBB-NEXT: sltu a2, a1, a3 +; RV32ZBB-NEXT: sub a6, a1, a6 +; RV32ZBB-NEXT: sub a2, a6, a2 +; RV32ZBB-NEXT: sltu a6, a1, a4 +; RV32ZBB-NEXT: mv a7, a6 +; RV32ZBB-NEXT: beqz t1, .LBB31_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu a6, a1, a7 +; RV32ZBB-NEXT: sltu a7, a1, a5 ; RV32ZBB-NEXT: .LBB31_4: -; RV32ZBB-NEXT: sub a2, a1, a2 -; RV32ZBB-NEXT: sub a7, a1, a7 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a5, a1, a5 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sltu a4, a2, a6 -; RV32ZBB-NEXT: sub a2, a2, a6 -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a4, a3, a7 +; RV32ZBB-NEXT: sub a3, a3, a7 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128: @@ -2533,53 +2533,53 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a6, 12(a2) -; RV32I-NEXT: lw t0, 8(a1) -; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 12(a2) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a7, 4(a1) -; RV32I-NEXT: sltu a1, t0, a5 -; RV32I-NEXT: sub t1, t1, a6 -; RV32I-NEXT: sltu a6, a2, a3 -; RV32I-NEXT: sub a1, t1, a1 -; RV32I-NEXT: mv t1, a6 -; RV32I-NEXT: beq a7, a4, .LBB32_2 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw t1, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sltu t0, t1, a6 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: sltu a7, a2, a3 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: beq a5, a4, .LBB32_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a7, a4 +; RV32I-NEXT: sltu t0, a5, a4 ; RV32I-NEXT: .LBB32_2: -; RV32I-NEXT: sub a5, t0, a5 -; RV32I-NEXT: sub a4, a7, a4 -; RV32I-NEXT: sub a3, a2, a3 -; RV32I-NEXT: sltu a2, a5, t1 -; RV32I-NEXT: sub t0, a4, a6 -; RV32I-NEXT: sub a4, a5, t1 -; RV32I-NEXT: sub a5, a1, a2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: sltu a3, a6, t0 +; RV32I-NEXT: sub t1, a5, a7 +; RV32I-NEXT: sub a4, a6, t0 +; RV32I-NEXT: sub a5, a1, a3 ; RV32I-NEXT: srai a1, a5, 31 -; RV32I-NEXT: xor a2, a4, a1 -; RV32I-NEXT: xor a5, a5, a1 -; RV32I-NEXT: xor a4, a3, a1 -; RV32I-NEXT: sltu a3, a1, a2 -; RV32I-NEXT: sub a6, a1, a5 -; RV32I-NEXT: sltu a5, a1, a4 -; RV32I-NEXT: sub a3, a6, a3 -; RV32I-NEXT: xor a7, t0, a1 -; RV32I-NEXT: mv a6, a5 -; RV32I-NEXT: beqz t0, .LBB32_4 +; RV32I-NEXT: xor a3, a4, a1 +; RV32I-NEXT: xor a6, a5, a1 +; RV32I-NEXT: xor a5, t1, a1 +; RV32I-NEXT: xor a4, a2, a1 +; RV32I-NEXT: sltu a2, a1, a3 +; RV32I-NEXT: sub a6, a1, a6 +; RV32I-NEXT: sub a2, a6, a2 +; RV32I-NEXT: sltu a6, a1, a4 +; RV32I-NEXT: mv a7, a6 +; RV32I-NEXT: beqz t1, .LBB32_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu a6, a1, a7 +; RV32I-NEXT: sltu a7, a1, a5 ; RV32I-NEXT: .LBB32_4: -; RV32I-NEXT: sub a2, a1, a2 -; 
RV32I-NEXT: sub a7, a1, a7 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sub a5, a1, a5 ; RV32I-NEXT: sub a1, a1, a4 -; RV32I-NEXT: sltu a4, a2, a6 -; RV32I-NEXT: sub a2, a2, a6 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a4, a3, a7 +; RV32I-NEXT: sub a3, a3, a7 +; RV32I-NEXT: sub a5, a5, a6 +; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_subnsw_i128_undef: @@ -2601,53 +2601,53 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a6, 12(a2) -; RV32ZBB-NEXT: lw t0, 8(a1) -; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 12(a2) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a7, 4(a1) -; RV32ZBB-NEXT: sltu a1, t0, a5 -; RV32ZBB-NEXT: sub t1, t1, a6 -; RV32ZBB-NEXT: sltu a6, a2, a3 -; RV32ZBB-NEXT: sub a1, t1, a1 -; RV32ZBB-NEXT: mv t1, a6 -; RV32ZBB-NEXT: beq a7, a4, .LBB32_2 +; RV32ZBB-NEXT: lw a5, 4(a1) +; RV32ZBB-NEXT: lw t1, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: sltu t0, t1, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: sub a1, a1, t0 +; RV32ZBB-NEXT: sltu a7, a2, a3 +; RV32ZBB-NEXT: mv t0, a7 +; RV32ZBB-NEXT: beq a5, a4, .LBB32_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a7, a4 +; RV32ZBB-NEXT: sltu t0, a5, a4 ; RV32ZBB-NEXT: .LBB32_2: -; RV32ZBB-NEXT: sub a5, t0, a5 -; RV32ZBB-NEXT: sub a4, a7, a4 -; RV32ZBB-NEXT: sub a3, a2, a3 -; RV32ZBB-NEXT: sltu a2, a5, t1 -; RV32ZBB-NEXT: sub t0, a4, a6 -; RV32ZBB-NEXT: sub a4, a5, t1 -; RV32ZBB-NEXT: sub a5, a1, a2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a5, a5, a4 +; RV32ZBB-NEXT: sub a2, a2, a3 +; RV32ZBB-NEXT: sltu a3, a6, t0 +; RV32ZBB-NEXT: sub t1, a5, a7 +; RV32ZBB-NEXT: sub a4, a6, t0 +; RV32ZBB-NEXT: sub a5, a1, a3 ; RV32ZBB-NEXT: srai a1, a5, 31 -; RV32ZBB-NEXT: xor a2, a4, a1 -; RV32ZBB-NEXT: xor a5, a5, a1 -; RV32ZBB-NEXT: xor a4, a3, a1 -; RV32ZBB-NEXT: sltu a3, a1, a2 -; RV32ZBB-NEXT: sub a6, a1, a5 -; RV32ZBB-NEXT: sltu a5, a1, a4 -; RV32ZBB-NEXT: sub a3, a6, a3 -; RV32ZBB-NEXT: xor a7, t0, a1 -; RV32ZBB-NEXT: mv a6, a5 -; RV32ZBB-NEXT: beqz t0, .LBB32_4 +; RV32ZBB-NEXT: xor a3, a4, a1 +; RV32ZBB-NEXT: xor a6, a5, a1 +; RV32ZBB-NEXT: xor a5, t1, a1 +; RV32ZBB-NEXT: xor a4, a2, a1 +; RV32ZBB-NEXT: sltu a2, a1, a3 +; RV32ZBB-NEXT: sub a6, a1, a6 +; RV32ZBB-NEXT: sub a2, a6, a2 +; RV32ZBB-NEXT: sltu a6, a1, a4 +; RV32ZBB-NEXT: mv a7, a6 +; RV32ZBB-NEXT: beqz t1, .LBB32_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu a6, a1, a7 +; RV32ZBB-NEXT: sltu a7, a1, a5 ; RV32ZBB-NEXT: .LBB32_4: -; RV32ZBB-NEXT: sub a2, a1, a2 -; RV32ZBB-NEXT: sub a7, a1, a7 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a5, a1, a5 ; RV32ZBB-NEXT: sub a1, a1, a4 -; RV32ZBB-NEXT: sltu a4, a2, a6 -; RV32ZBB-NEXT: sub a2, a2, a6 -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a4, a3, a7 +; RV32ZBB-NEXT: sub a3, a3, a7 +; RV32ZBB-NEXT: sub a5, a5, a6 +; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_subnsw_i128_undef: diff --git a/llvm/test/CodeGen/RISCV/abds.ll 
b/llvm/test/CodeGen/RISCV/abds.ll index 56e6dacff9748..9e866220af666 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -538,18 +538,18 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a5, 0(a2) ; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: sltu a2, a7, a6 ; RV32I-NEXT: mv t4, a2 ; RV32I-NEXT: beq t0, t1, .LBB11_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a4, a1, .LBB11_4 ; RV32I-NEXT: # %bb.3: @@ -634,18 +634,18 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a5, 0(a2) ; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: sltu a2, a7, a6 ; RV32ZBB-NEXT: mv t4, a2 ; RV32ZBB-NEXT: beq t0, t1, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a4, a1, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: @@ -738,18 +738,18 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a5, 0(a2) ; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: sltu a2, a7, a6 ; RV32I-NEXT: mv t4, a2 ; RV32I-NEXT: beq t0, t1, .LBB12_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a4, a1, .LBB12_4 ; RV32I-NEXT: # %bb.3: @@ -834,18 +834,18 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a5, 0(a2) ; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: sltu a2, a7, a6 ; RV32ZBB-NEXT: mv t4, a2 ; RV32ZBB-NEXT: beq t0, t1, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a4, a1, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: @@ -1127,18 +1127,18 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a5, 0(a2) ; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: sltu a2, a7, a6 ; RV32I-NEXT: mv t4, a2 ; RV32I-NEXT: beq t0, t1, .LBB17_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB17_2: -; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a4, a1, 
.LBB17_4 ; RV32I-NEXT: # %bb.3: @@ -1223,18 +1223,18 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a5, 0(a2) ; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: sltu a2, a7, a6 ; RV32ZBB-NEXT: mv t4, a2 ; RV32ZBB-NEXT: beq t0, t1, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB17_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a4, a1, .LBB17_4 ; RV32ZBB-NEXT: # %bb.3: @@ -1518,18 +1518,18 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a5, 0(a2) ; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: sltu a2, a7, a6 ; RV32I-NEXT: mv t4, a2 ; RV32I-NEXT: beq t0, t1, .LBB22_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a4, a1, .LBB22_4 ; RV32I-NEXT: # %bb.3: @@ -1614,18 +1614,18 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a5, 0(a2) ; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: sltu a2, a7, a6 ; RV32ZBB-NEXT: mv t4, a2 ; RV32ZBB-NEXT: beq t0, t1, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a4, a1, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: @@ -2045,27 +2045,27 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a4, 0(a2) ; RV32I-NEXT: lw a3, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a6, 12(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 12(a2) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a5 -; RV32I-NEXT: sub t0, t0, a6 -; RV32I-NEXT: sltu a6, a2, a4 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: mv t1, a6 -; RV32I-NEXT: beq a1, a3, .LBB31_2 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw t0, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sltu t1, t0, a6 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a1, a1, t1 +; RV32I-NEXT: sltu a7, a2, a4 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beq a5, a3, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a3 +; RV32I-NEXT: sltu t1, a5, a3 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a3, a1, a3 -; RV32I-NEXT: sltu a1, a5, t1 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a1, t0, a1 -; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a6, t0, a6 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: sltu t0, a6, t1 +; RV32I-NEXT: sub a5, a6, t1 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: sub a3, a3, a7 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: bgez a1, .LBB31_4 ; RV32I-NEXT: # %bb.3: @@ -2108,27 +2108,27 @@ define i128 
@abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a4, 0(a2) ; RV32ZBB-NEXT: lw a3, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a6, 12(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 12(a2) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a5 -; RV32ZBB-NEXT: sub t0, t0, a6 -; RV32ZBB-NEXT: sltu a6, a2, a4 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: mv t1, a6 -; RV32ZBB-NEXT: beq a1, a3, .LBB31_2 +; RV32ZBB-NEXT: lw a5, 4(a1) +; RV32ZBB-NEXT: lw t0, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: sltu t1, t0, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: sub a1, a1, t1 +; RV32ZBB-NEXT: sltu a7, a2, a4 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beq a5, a3, .LBB31_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a3 +; RV32ZBB-NEXT: sltu t1, a5, a3 ; RV32ZBB-NEXT: .LBB31_2: -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a3, a1, a3 -; RV32ZBB-NEXT: sltu a1, a5, t1 -; RV32ZBB-NEXT: sub a5, a5, t1 -; RV32ZBB-NEXT: sub a1, t0, a1 -; RV32ZBB-NEXT: sub a3, a3, a6 +; RV32ZBB-NEXT: sub a6, t0, a6 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sltu t0, a6, t1 +; RV32ZBB-NEXT: sub a5, a6, t1 +; RV32ZBB-NEXT: sub a1, a1, t0 +; RV32ZBB-NEXT: sub a3, a3, a7 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: bgez a1, .LBB31_4 ; RV32ZBB-NEXT: # %bb.3: @@ -2176,27 +2176,27 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a4, 0(a2) ; RV32I-NEXT: lw a3, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a6, 12(a2) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a7, 12(a2) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a7, a5 -; RV32I-NEXT: sub t0, t0, a6 -; RV32I-NEXT: sltu a6, a2, a4 -; RV32I-NEXT: sub t0, t0, t1 -; RV32I-NEXT: mv t1, a6 -; RV32I-NEXT: beq a1, a3, .LBB32_2 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw t0, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sltu t1, t0, a6 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a1, a1, t1 +; RV32I-NEXT: sltu a7, a2, a4 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beq a5, a3, .LBB32_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a3 +; RV32I-NEXT: sltu t1, a5, a3 ; RV32I-NEXT: .LBB32_2: -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: sub a3, a1, a3 -; RV32I-NEXT: sltu a1, a5, t1 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a1, t0, a1 -; RV32I-NEXT: sub a3, a3, a6 +; RV32I-NEXT: sub a6, t0, a6 +; RV32I-NEXT: sub a3, a5, a3 +; RV32I-NEXT: sltu t0, a6, t1 +; RV32I-NEXT: sub a5, a6, t1 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: sub a3, a3, a7 ; RV32I-NEXT: sub a2, a2, a4 ; RV32I-NEXT: bgez a1, .LBB32_4 ; RV32I-NEXT: # %bb.3: @@ -2239,27 +2239,27 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a4, 0(a2) ; RV32ZBB-NEXT: lw a3, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a6, 12(a2) -; RV32ZBB-NEXT: lw a7, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a7, 12(a2) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a7, a5 -; RV32ZBB-NEXT: sub t0, t0, a6 -; RV32ZBB-NEXT: sltu a6, a2, a4 -; RV32ZBB-NEXT: sub t0, t0, t1 -; RV32ZBB-NEXT: mv t1, a6 -; RV32ZBB-NEXT: beq a1, a3, .LBB32_2 +; RV32ZBB-NEXT: lw a5, 4(a1) +; RV32ZBB-NEXT: lw t0, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: sltu t1, 
t0, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 +; RV32ZBB-NEXT: sub a1, a1, t1 +; RV32ZBB-NEXT: sltu a7, a2, a4 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beq a5, a3, .LBB32_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a3 +; RV32ZBB-NEXT: sltu t1, a5, a3 ; RV32ZBB-NEXT: .LBB32_2: -; RV32ZBB-NEXT: sub a5, a7, a5 -; RV32ZBB-NEXT: sub a3, a1, a3 -; RV32ZBB-NEXT: sltu a1, a5, t1 -; RV32ZBB-NEXT: sub a5, a5, t1 -; RV32ZBB-NEXT: sub a1, t0, a1 -; RV32ZBB-NEXT: sub a3, a3, a6 +; RV32ZBB-NEXT: sub a6, t0, a6 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sltu t0, a6, t1 +; RV32ZBB-NEXT: sub a5, a6, t1 +; RV32ZBB-NEXT: sub a1, a1, t0 +; RV32ZBB-NEXT: sub a3, a3, a7 ; RV32ZBB-NEXT: sub a2, a2, a4 ; RV32ZBB-NEXT: bgez a1, .LBB32_4 ; RV32ZBB-NEXT: # %bb.3: @@ -2541,18 +2541,18 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw t0, 12(a1) -; RV32I-NEXT: lw a7, 8(a2) -; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: lw a5, 0(a2) ; RV32I-NEXT: lw a1, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw t1, 12(a2) ; RV32I-NEXT: sltu a2, a7, a6 ; RV32I-NEXT: mv t4, a2 ; RV32I-NEXT: beq t0, t1, .LBB38_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: slt t4, t1, t0 ; RV32I-NEXT: .LBB38_2: -; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: sltu t5, a1, a4 +; RV32I-NEXT: sltu t2, a5, a3 ; RV32I-NEXT: mv t3, t2 ; RV32I-NEXT: beq a4, a1, .LBB38_4 ; RV32I-NEXT: # %bb.3: @@ -2637,18 +2637,18 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: lw a4, 4(a1) ; RV32ZBB-NEXT: lw a6, 8(a1) ; RV32ZBB-NEXT: lw t0, 12(a1) -; RV32ZBB-NEXT: lw a7, 8(a2) -; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: lw a5, 0(a2) ; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t1, 12(a2) ; RV32ZBB-NEXT: sltu a2, a7, a6 ; RV32ZBB-NEXT: mv t4, a2 ; RV32ZBB-NEXT: beq t0, t1, .LBB38_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: slt t4, t1, t0 ; RV32ZBB-NEXT: .LBB38_2: -; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: sltu t2, a5, a3 ; RV32ZBB-NEXT: mv t3, t2 ; RV32ZBB-NEXT: beq a4, a1, .LBB38_4 ; RV32ZBB-NEXT: # %bb.3: diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll index 9e41cde7ae181..a904def2753db 100644 --- a/llvm/test/CodeGen/RISCV/abdu-neg.ll +++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll @@ -624,24 +624,24 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a4, 0(a2) ; RV32I-NEXT: lw a7, 4(a2) ; RV32I-NEXT: lw a3, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw t0, 4(a1) -; RV32I-NEXT: sltu a1, a4, a3 +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: sltu a1, a5, a3 ; RV32I-NEXT: sub t1, a6, t1 -; RV32I-NEXT: sltu t2, a2, a5 ; RV32I-NEXT: sub a1, t1, a1 +; RV32I-NEXT: sltu t2, a2, a4 ; RV32I-NEXT: mv t1, t2 ; RV32I-NEXT: beq t0, a7, .LBB11_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t1, t0, a7 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sltu t3, a3, t1 ; RV32I-NEXT: sub a1, a1, t3 ; RV32I-NEXT: sub a3, a3, t1 @@ -650,27 +650,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sltu t1, a6, a1 ; RV32I-NEXT: j .LBB11_5 ; RV32I-NEXT: .LBB11_4: -; RV32I-NEXT: sltu t1, a4, a3 +; RV32I-NEXT: sltu t1, a5, a3 ; RV32I-NEXT: .LBB11_5: 
; RV32I-NEXT: sub a7, t0, a7 ; RV32I-NEXT: sub a7, a7, t2 -; RV32I-NEXT: sub a5, a2, a5 +; RV32I-NEXT: sub t2, a2, a4 ; RV32I-NEXT: beq a7, t0, .LBB11_7 ; RV32I-NEXT: # %bb.6: ; RV32I-NEXT: sltu a2, t0, a7 ; RV32I-NEXT: j .LBB11_8 ; RV32I-NEXT: .LBB11_7: -; RV32I-NEXT: sltu a2, a2, a5 +; RV32I-NEXT: sltu a2, a2, t2 ; RV32I-NEXT: .LBB11_8: -; RV32I-NEXT: xor a6, a1, a6 -; RV32I-NEXT: xor a4, a3, a4 -; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: xor a4, a1, a6 +; RV32I-NEXT: xor a5, a3, a5 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: beqz a4, .LBB11_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a2, t1 ; RV32I-NEXT: .LBB11_10: ; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: xor t0, a5, a4 +; RV32I-NEXT: xor t0, t2, a4 ; RV32I-NEXT: xor t3, a7, a4 ; RV32I-NEXT: sltu a5, t0, a4 ; RV32I-NEXT: add a6, t3, a2 @@ -736,24 +736,24 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_ext_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 0(a2) ; RV32ZBB-NEXT: lw a7, 4(a2) ; RV32ZBB-NEXT: lw a3, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw a6, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) ; RV32ZBB-NEXT: lw t0, 4(a1) -; RV32ZBB-NEXT: sltu a1, a4, a3 +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: sltu a1, a5, a3 ; RV32ZBB-NEXT: sub t1, a6, t1 -; RV32ZBB-NEXT: sltu t2, a2, a5 ; RV32ZBB-NEXT: sub a1, t1, a1 +; RV32ZBB-NEXT: sltu t2, a2, a4 ; RV32ZBB-NEXT: mv t1, t2 ; RV32ZBB-NEXT: beq t0, a7, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: sltu t1, t0, a7 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sltu t3, a3, t1 ; RV32ZBB-NEXT: sub a1, a1, t3 ; RV32ZBB-NEXT: sub a3, a3, t1 @@ -762,27 +762,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: sltu t1, a6, a1 ; RV32ZBB-NEXT: j .LBB11_5 ; RV32ZBB-NEXT: .LBB11_4: -; RV32ZBB-NEXT: sltu t1, a4, a3 +; RV32ZBB-NEXT: sltu t1, a5, a3 ; RV32ZBB-NEXT: .LBB11_5: ; RV32ZBB-NEXT: sub a7, t0, a7 ; RV32ZBB-NEXT: sub a7, a7, t2 -; RV32ZBB-NEXT: sub a5, a2, a5 +; RV32ZBB-NEXT: sub t2, a2, a4 ; RV32ZBB-NEXT: beq a7, t0, .LBB11_7 ; RV32ZBB-NEXT: # %bb.6: ; RV32ZBB-NEXT: sltu a2, t0, a7 ; RV32ZBB-NEXT: j .LBB11_8 ; RV32ZBB-NEXT: .LBB11_7: -; RV32ZBB-NEXT: sltu a2, a2, a5 +; RV32ZBB-NEXT: sltu a2, a2, t2 ; RV32ZBB-NEXT: .LBB11_8: -; RV32ZBB-NEXT: xor a6, a1, a6 -; RV32ZBB-NEXT: xor a4, a3, a4 -; RV32ZBB-NEXT: or a4, a4, a6 +; RV32ZBB-NEXT: xor a4, a1, a6 +; RV32ZBB-NEXT: xor a5, a3, a5 +; RV32ZBB-NEXT: or a4, a5, a4 ; RV32ZBB-NEXT: beqz a4, .LBB11_10 ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: mv a2, t1 ; RV32ZBB-NEXT: .LBB11_10: ; RV32ZBB-NEXT: neg a4, a2 -; RV32ZBB-NEXT: xor t0, a5, a4 +; RV32ZBB-NEXT: xor t0, t2, a4 ; RV32ZBB-NEXT: xor t3, a7, a4 ; RV32ZBB-NEXT: sltu a5, t0, a4 ; RV32ZBB-NEXT: add a6, t3, a2 @@ -857,24 +857,24 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128_undef: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a4, 0(a2) ; RV32I-NEXT: lw a7, 4(a2) ; RV32I-NEXT: lw a3, 8(a2) ; RV32I-NEXT: lw t1, 12(a2) -; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) ; RV32I-NEXT: lw t0, 4(a1) -; RV32I-NEXT: sltu a1, a4, a3 +; RV32I-NEXT: lw a5, 8(a1) +; RV32I-NEXT: lw a6, 12(a1) +; RV32I-NEXT: sltu a1, a5, a3 ; RV32I-NEXT: sub t1, a6, t1 -; RV32I-NEXT: sltu t2, a2, a5 ; RV32I-NEXT: sub a1, t1, a1 +; RV32I-NEXT: sltu t2, a2, a4 ; RV32I-NEXT: 
mv t1, t2 ; RV32I-NEXT: beq t0, a7, .LBB12_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t1, t0, a7 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sub a3, a4, a3 +; RV32I-NEXT: sub a3, a5, a3 ; RV32I-NEXT: sltu t3, a3, t1 ; RV32I-NEXT: sub a1, a1, t3 ; RV32I-NEXT: sub a3, a3, t1 @@ -883,27 +883,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: sltu t1, a6, a1 ; RV32I-NEXT: j .LBB12_5 ; RV32I-NEXT: .LBB12_4: -; RV32I-NEXT: sltu t1, a4, a3 +; RV32I-NEXT: sltu t1, a5, a3 ; RV32I-NEXT: .LBB12_5: ; RV32I-NEXT: sub a7, t0, a7 ; RV32I-NEXT: sub a7, a7, t2 -; RV32I-NEXT: sub a5, a2, a5 +; RV32I-NEXT: sub t2, a2, a4 ; RV32I-NEXT: beq a7, t0, .LBB12_7 ; RV32I-NEXT: # %bb.6: ; RV32I-NEXT: sltu a2, t0, a7 ; RV32I-NEXT: j .LBB12_8 ; RV32I-NEXT: .LBB12_7: -; RV32I-NEXT: sltu a2, a2, a5 +; RV32I-NEXT: sltu a2, a2, t2 ; RV32I-NEXT: .LBB12_8: -; RV32I-NEXT: xor a6, a1, a6 -; RV32I-NEXT: xor a4, a3, a4 -; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: xor a4, a1, a6 +; RV32I-NEXT: xor a5, a3, a5 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: beqz a4, .LBB12_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a2, t1 ; RV32I-NEXT: .LBB12_10: ; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: xor t0, a5, a4 +; RV32I-NEXT: xor t0, t2, a4 ; RV32I-NEXT: xor t3, a7, a4 ; RV32I-NEXT: sltu a5, t0, a4 ; RV32I-NEXT: add a6, t3, a2 @@ -969,24 +969,24 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_ext_i128_undef: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 0(a2) ; RV32ZBB-NEXT: lw a7, 4(a2) ; RV32ZBB-NEXT: lw a3, 8(a2) ; RV32ZBB-NEXT: lw t1, 12(a2) -; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: lw a6, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) ; RV32ZBB-NEXT: lw t0, 4(a1) -; RV32ZBB-NEXT: sltu a1, a4, a3 +; RV32ZBB-NEXT: lw a5, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: sltu a1, a5, a3 ; RV32ZBB-NEXT: sub t1, a6, t1 -; RV32ZBB-NEXT: sltu t2, a2, a5 ; RV32ZBB-NEXT: sub a1, t1, a1 +; RV32ZBB-NEXT: sltu t2, a2, a4 ; RV32ZBB-NEXT: mv t1, t2 ; RV32ZBB-NEXT: beq t0, a7, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: sltu t1, t0, a7 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sub a3, a4, a3 +; RV32ZBB-NEXT: sub a3, a5, a3 ; RV32ZBB-NEXT: sltu t3, a3, t1 ; RV32ZBB-NEXT: sub a1, a1, t3 ; RV32ZBB-NEXT: sub a3, a3, t1 @@ -995,27 +995,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: sltu t1, a6, a1 ; RV32ZBB-NEXT: j .LBB12_5 ; RV32ZBB-NEXT: .LBB12_4: -; RV32ZBB-NEXT: sltu t1, a4, a3 +; RV32ZBB-NEXT: sltu t1, a5, a3 ; RV32ZBB-NEXT: .LBB12_5: ; RV32ZBB-NEXT: sub a7, t0, a7 ; RV32ZBB-NEXT: sub a7, a7, t2 -; RV32ZBB-NEXT: sub a5, a2, a5 +; RV32ZBB-NEXT: sub t2, a2, a4 ; RV32ZBB-NEXT: beq a7, t0, .LBB12_7 ; RV32ZBB-NEXT: # %bb.6: ; RV32ZBB-NEXT: sltu a2, t0, a7 ; RV32ZBB-NEXT: j .LBB12_8 ; RV32ZBB-NEXT: .LBB12_7: -; RV32ZBB-NEXT: sltu a2, a2, a5 +; RV32ZBB-NEXT: sltu a2, a2, t2 ; RV32ZBB-NEXT: .LBB12_8: -; RV32ZBB-NEXT: xor a6, a1, a6 -; RV32ZBB-NEXT: xor a4, a3, a4 -; RV32ZBB-NEXT: or a4, a4, a6 +; RV32ZBB-NEXT: xor a4, a1, a6 +; RV32ZBB-NEXT: xor a5, a3, a5 +; RV32ZBB-NEXT: or a4, a5, a4 ; RV32ZBB-NEXT: beqz a4, .LBB12_10 ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: mv a2, t1 ; RV32ZBB-NEXT: .LBB12_10: ; RV32ZBB-NEXT: neg a4, a2 -; RV32ZBB-NEXT: xor t0, a5, a4 +; RV32ZBB-NEXT: xor t0, t2, a4 ; RV32ZBB-NEXT: xor t3, a7, a4 ; RV32ZBB-NEXT: sltu a5, t0, a4 ; RV32ZBB-NEXT: add a6, t3, a2 @@ -1335,30 +1335,30 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: 
abd_minmax_i128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a6, 4(a2) -; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a5, 4(a2) +; RV32I-NEXT: lw a6, 8(a2) ; RV32I-NEXT: lw t0, 12(a2) -; RV32I-NEXT: lw a5, 12(a1) ; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: beq a5, t0, .LBB17_2 +; RV32I-NEXT: lw a7, 12(a1) +; RV32I-NEXT: beq a7, t0, .LBB17_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: sltu t1, a7, t0 ; RV32I-NEXT: j .LBB17_3 ; RV32I-NEXT: .LBB17_2: -; RV32I-NEXT: sltu t1, a4, a7 +; RV32I-NEXT: sltu t1, a4, a6 ; RV32I-NEXT: .LBB17_3: ; RV32I-NEXT: lw t2, 0(a2) ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: beq a3, a6, .LBB17_5 +; RV32I-NEXT: beq a3, a5, .LBB17_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sltu t6, a3, a6 +; RV32I-NEXT: sltu t6, a3, a5 ; RV32I-NEXT: j .LBB17_6 ; RV32I-NEXT: .LBB17_5: ; RV32I-NEXT: sltu t6, a1, t2 ; RV32I-NEXT: .LBB17_6: -; RV32I-NEXT: xor a2, a5, t0 -; RV32I-NEXT: xor t3, a4, a7 +; RV32I-NEXT: xor a2, a7, t0 +; RV32I-NEXT: xor t3, a4, a6 ; RV32I-NEXT: or t5, t3, a2 ; RV32I-NEXT: beqz t5, .LBB17_8 ; RV32I-NEXT: # %bb.7: @@ -1366,27 +1366,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB17_8: ; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: mv t1, a3 -; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t4, a7 ; RV32I-NEXT: mv t3, a4 ; RV32I-NEXT: bnez t6, .LBB17_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a2, t2 -; RV32I-NEXT: mv t1, a6 +; RV32I-NEXT: mv t1, a5 ; RV32I-NEXT: mv t4, t0 -; RV32I-NEXT: mv t3, a7 +; RV32I-NEXT: mv t3, a6 ; RV32I-NEXT: .LBB17_10: -; RV32I-NEXT: beq a5, t0, .LBB17_12 +; RV32I-NEXT: beq a7, t0, .LBB17_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu t6, t0, a5 +; RV32I-NEXT: sltu t6, t0, a7 ; RV32I-NEXT: j .LBB17_13 ; RV32I-NEXT: .LBB17_12: -; RV32I-NEXT: sltu t6, a7, a4 +; RV32I-NEXT: sltu t6, a6, a4 ; RV32I-NEXT: .LBB17_13: ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: beq a3, a6, .LBB17_15 +; RV32I-NEXT: beq a3, a5, .LBB17_15 ; RV32I-NEXT: # %bb.14: -; RV32I-NEXT: sltu s0, a6, a3 +; RV32I-NEXT: sltu s0, a5, a3 ; RV32I-NEXT: bnez t5, .LBB17_16 ; RV32I-NEXT: j .LBB17_17 ; RV32I-NEXT: .LBB17_15: @@ -1398,14 +1398,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: bnez s0, .LBB17_19 ; RV32I-NEXT: # %bb.18: ; RV32I-NEXT: mv a1, t2 -; RV32I-NEXT: mv a3, a6 -; RV32I-NEXT: mv a5, t0 -; RV32I-NEXT: mv a4, a7 +; RV32I-NEXT: mv a3, a5 +; RV32I-NEXT: mv a7, t0 +; RV32I-NEXT: mv a4, a6 ; RV32I-NEXT: .LBB17_19: -; RV32I-NEXT: sltu a7, t3, a4 -; RV32I-NEXT: sub a5, t4, a5 +; RV32I-NEXT: sltu a5, t3, a4 +; RV32I-NEXT: sub a6, t4, a7 +; RV32I-NEXT: sub a5, a6, a5 ; RV32I-NEXT: sltu a6, a2, a1 -; RV32I-NEXT: sub a5, a5, a7 ; RV32I-NEXT: mv a7, a6 ; RV32I-NEXT: beq t1, a3, .LBB17_21 ; RV32I-NEXT: # %bb.20: @@ -1462,30 +1462,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; ; RV32ZBB-LABEL: abd_minmax_i128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a6, 4(a2) -; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a5, 4(a2) +; RV32ZBB-NEXT: lw a6, 8(a2) ; RV32ZBB-NEXT: lw t0, 12(a2) -; RV32ZBB-NEXT: lw a5, 12(a1) ; RV32ZBB-NEXT: lw a3, 4(a1) ; RV32ZBB-NEXT: lw a4, 8(a1) -; RV32ZBB-NEXT: beq a5, t0, .LBB17_2 +; RV32ZBB-NEXT: lw a7, 12(a1) +; RV32ZBB-NEXT: beq a7, t0, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a5, t0 +; RV32ZBB-NEXT: sltu t1, a7, t0 ; RV32ZBB-NEXT: j .LBB17_3 ; RV32ZBB-NEXT: .LBB17_2: -; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: sltu t1, a4, a6 ; RV32ZBB-NEXT: .LBB17_3: ; RV32ZBB-NEXT: lw t2, 
0(a2) ; RV32ZBB-NEXT: lw a1, 0(a1) -; RV32ZBB-NEXT: beq a3, a6, .LBB17_5 +; RV32ZBB-NEXT: beq a3, a5, .LBB17_5 ; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: sltu t6, a3, a6 +; RV32ZBB-NEXT: sltu t6, a3, a5 ; RV32ZBB-NEXT: j .LBB17_6 ; RV32ZBB-NEXT: .LBB17_5: ; RV32ZBB-NEXT: sltu t6, a1, t2 ; RV32ZBB-NEXT: .LBB17_6: -; RV32ZBB-NEXT: xor a2, a5, t0 -; RV32ZBB-NEXT: xor t3, a4, a7 +; RV32ZBB-NEXT: xor a2, a7, t0 +; RV32ZBB-NEXT: xor t3, a4, a6 ; RV32ZBB-NEXT: or t5, t3, a2 ; RV32ZBB-NEXT: beqz t5, .LBB17_8 ; RV32ZBB-NEXT: # %bb.7: @@ -1493,27 +1493,27 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB17_8: ; RV32ZBB-NEXT: mv a2, a1 ; RV32ZBB-NEXT: mv t1, a3 -; RV32ZBB-NEXT: mv t4, a5 +; RV32ZBB-NEXT: mv t4, a7 ; RV32ZBB-NEXT: mv t3, a4 ; RV32ZBB-NEXT: bnez t6, .LBB17_10 ; RV32ZBB-NEXT: # %bb.9: ; RV32ZBB-NEXT: mv a2, t2 -; RV32ZBB-NEXT: mv t1, a6 +; RV32ZBB-NEXT: mv t1, a5 ; RV32ZBB-NEXT: mv t4, t0 -; RV32ZBB-NEXT: mv t3, a7 +; RV32ZBB-NEXT: mv t3, a6 ; RV32ZBB-NEXT: .LBB17_10: -; RV32ZBB-NEXT: beq a5, t0, .LBB17_12 +; RV32ZBB-NEXT: beq a7, t0, .LBB17_12 ; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu t6, t0, a5 +; RV32ZBB-NEXT: sltu t6, t0, a7 ; RV32ZBB-NEXT: j .LBB17_13 ; RV32ZBB-NEXT: .LBB17_12: -; RV32ZBB-NEXT: sltu t6, a7, a4 +; RV32ZBB-NEXT: sltu t6, a6, a4 ; RV32ZBB-NEXT: .LBB17_13: ; RV32ZBB-NEXT: addi sp, sp, -16 ; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32ZBB-NEXT: beq a3, a6, .LBB17_15 +; RV32ZBB-NEXT: beq a3, a5, .LBB17_15 ; RV32ZBB-NEXT: # %bb.14: -; RV32ZBB-NEXT: sltu s0, a6, a3 +; RV32ZBB-NEXT: sltu s0, a5, a3 ; RV32ZBB-NEXT: bnez t5, .LBB17_16 ; RV32ZBB-NEXT: j .LBB17_17 ; RV32ZBB-NEXT: .LBB17_15: @@ -1525,14 +1525,14 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: bnez s0, .LBB17_19 ; RV32ZBB-NEXT: # %bb.18: ; RV32ZBB-NEXT: mv a1, t2 -; RV32ZBB-NEXT: mv a3, a6 -; RV32ZBB-NEXT: mv a5, t0 -; RV32ZBB-NEXT: mv a4, a7 +; RV32ZBB-NEXT: mv a3, a5 +; RV32ZBB-NEXT: mv a7, t0 +; RV32ZBB-NEXT: mv a4, a6 ; RV32ZBB-NEXT: .LBB17_19: -; RV32ZBB-NEXT: sltu a7, t3, a4 -; RV32ZBB-NEXT: sub a5, t4, a5 +; RV32ZBB-NEXT: sltu a5, t3, a4 +; RV32ZBB-NEXT: sub a6, t4, a7 +; RV32ZBB-NEXT: sub a5, a6, a5 ; RV32ZBB-NEXT: sltu a6, a2, a1 -; RV32ZBB-NEXT: sub a5, a5, a7 ; RV32ZBB-NEXT: mv a7, a6 ; RV32ZBB-NEXT: beq t1, a3, .LBB17_21 ; RV32ZBB-NEXT: # %bb.20: @@ -1799,26 +1799,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) ; RV32I-NEXT: lw a4, 4(a2) -; RV32I-NEXT: lw a5, 8(a2) -; RV32I-NEXT: lw a7, 12(a2) -; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: lw t0, 12(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a6, a5 -; RV32I-NEXT: mv t4, t1 -; RV32I-NEXT: beq t0, a7, .LBB22_2 +; RV32I-NEXT: lw a5, 4(a1) +; RV32I-NEXT: lw a7, 8(a1) +; RV32I-NEXT: lw t1, 12(a1) +; RV32I-NEXT: sltu a1, a7, a6 +; RV32I-NEXT: mv t4, a1 +; RV32I-NEXT: beq t1, t0, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t4, t0, a7 +; RV32I-NEXT: sltu t4, t1, t0 ; RV32I-NEXT: .LBB22_2: ; RV32I-NEXT: sltu t2, a2, a3 ; RV32I-NEXT: mv t3, t2 -; RV32I-NEXT: beq a1, a4, .LBB22_4 +; RV32I-NEXT: beq a5, a4, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t3, a1, a4 +; RV32I-NEXT: sltu t3, a5, a4 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: xor t5, t0, a7 -; RV32I-NEXT: xor t6, a6, a5 +; RV32I-NEXT: xor t5, t1, t0 +; RV32I-NEXT: xor t6, a7, a6 ; RV32I-NEXT: or t5, t6, t5 ; RV32I-NEXT: mv t6, t3 ; RV32I-NEXT: beqz t5, .LBB22_6 @@ 
-1827,32 +1827,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: .LBB22_6: ; RV32I-NEXT: sltu t4, a3, a2 ; RV32I-NEXT: mv t5, t4 -; RV32I-NEXT: beq a1, a4, .LBB22_8 +; RV32I-NEXT: beq a5, a4, .LBB22_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: sltu t5, a4, a1 +; RV32I-NEXT: sltu t5, a4, a5 ; RV32I-NEXT: .LBB22_8: ; RV32I-NEXT: bnez t6, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu t1, a5, a6 -; RV32I-NEXT: sub a7, a7, t0 -; RV32I-NEXT: sub a5, a5, a6 -; RV32I-NEXT: sub a4, a4, a1 -; RV32I-NEXT: sub a6, a7, t1 -; RV32I-NEXT: sltu a7, a5, t5 -; RV32I-NEXT: sub a1, a5, t5 +; RV32I-NEXT: sltu a1, a6, a7 +; RV32I-NEXT: sub t0, t0, t1 +; RV32I-NEXT: sub a6, a6, a7 +; RV32I-NEXT: sub a4, a4, a5 +; RV32I-NEXT: sub a7, t0, a1 +; RV32I-NEXT: sltu t0, a6, t5 +; RV32I-NEXT: sub a1, a6, t5 ; RV32I-NEXT: sub a5, a4, t4 -; RV32I-NEXT: sub a4, a6, a7 +; RV32I-NEXT: sub a4, a7, t0 ; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: j .LBB22_11 ; RV32I-NEXT: .LBB22_10: -; RV32I-NEXT: sub a7, t0, a7 -; RV32I-NEXT: sub a5, a6, a5 -; RV32I-NEXT: sub a4, a1, a4 -; RV32I-NEXT: sub a6, a7, t1 -; RV32I-NEXT: sltu a7, a5, t3 -; RV32I-NEXT: sub a1, a5, t3 -; RV32I-NEXT: sub a5, a4, t2 -; RV32I-NEXT: sub a4, a6, a7 +; RV32I-NEXT: sub t0, t1, t0 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sub a4, t0, a1 +; RV32I-NEXT: sltu a7, a6, t3 +; RV32I-NEXT: sub a1, a6, t3 +; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a4, a4, a7 ; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: .LBB22_11: ; RV32I-NEXT: sw a2, 0(a0) @@ -1886,26 +1886,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) ; RV32ZBB-NEXT: lw a4, 4(a2) -; RV32ZBB-NEXT: lw a5, 8(a2) -; RV32ZBB-NEXT: lw a7, 12(a2) -; RV32ZBB-NEXT: lw a6, 8(a1) -; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw t0, 12(a2) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a6, a5 -; RV32ZBB-NEXT: mv t4, t1 -; RV32ZBB-NEXT: beq t0, a7, .LBB22_2 +; RV32ZBB-NEXT: lw a5, 4(a1) +; RV32ZBB-NEXT: lw a7, 8(a1) +; RV32ZBB-NEXT: lw t1, 12(a1) +; RV32ZBB-NEXT: sltu a1, a7, a6 +; RV32ZBB-NEXT: mv t4, a1 +; RV32ZBB-NEXT: beq t1, t0, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t4, t0, a7 +; RV32ZBB-NEXT: sltu t4, t1, t0 ; RV32ZBB-NEXT: .LBB22_2: ; RV32ZBB-NEXT: sltu t2, a2, a3 ; RV32ZBB-NEXT: mv t3, t2 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_4 +; RV32ZBB-NEXT: beq a5, a4, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t3, a1, a4 +; RV32ZBB-NEXT: sltu t3, a5, a4 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: xor t5, t0, a7 -; RV32ZBB-NEXT: xor t6, a6, a5 +; RV32ZBB-NEXT: xor t5, t1, t0 +; RV32ZBB-NEXT: xor t6, a7, a6 ; RV32ZBB-NEXT: or t5, t6, t5 ; RV32ZBB-NEXT: mv t6, t3 ; RV32ZBB-NEXT: beqz t5, .LBB22_6 @@ -1914,32 +1914,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-NEXT: .LBB22_6: ; RV32ZBB-NEXT: sltu t4, a3, a2 ; RV32ZBB-NEXT: mv t5, t4 -; RV32ZBB-NEXT: beq a1, a4, .LBB22_8 +; RV32ZBB-NEXT: beq a5, a4, .LBB22_8 ; RV32ZBB-NEXT: # %bb.7: -; RV32ZBB-NEXT: sltu t5, a4, a1 +; RV32ZBB-NEXT: sltu t5, a4, a5 ; RV32ZBB-NEXT: .LBB22_8: ; RV32ZBB-NEXT: bnez t6, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: sltu t1, a5, a6 -; RV32ZBB-NEXT: sub a7, a7, t0 -; RV32ZBB-NEXT: sub a5, a5, a6 -; RV32ZBB-NEXT: sub a4, a4, a1 -; RV32ZBB-NEXT: sub a6, a7, t1 -; RV32ZBB-NEXT: sltu a7, a5, t5 -; RV32ZBB-NEXT: sub a1, a5, t5 +; RV32ZBB-NEXT: sltu a1, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a6, a6, 
a7 +; RV32ZBB-NEXT: sub a4, a4, a5 +; RV32ZBB-NEXT: sub a7, t0, a1 +; RV32ZBB-NEXT: sltu t0, a6, t5 +; RV32ZBB-NEXT: sub a1, a6, t5 ; RV32ZBB-NEXT: sub a5, a4, t4 -; RV32ZBB-NEXT: sub a4, a6, a7 +; RV32ZBB-NEXT: sub a4, a7, t0 ; RV32ZBB-NEXT: sub a2, a3, a2 ; RV32ZBB-NEXT: j .LBB22_11 ; RV32ZBB-NEXT: .LBB22_10: -; RV32ZBB-NEXT: sub a7, t0, a7 -; RV32ZBB-NEXT: sub a5, a6, a5 -; RV32ZBB-NEXT: sub a4, a1, a4 -; RV32ZBB-NEXT: sub a6, a7, t1 -; RV32ZBB-NEXT: sltu a7, a5, t3 -; RV32ZBB-NEXT: sub a1, a5, t3 -; RV32ZBB-NEXT: sub a5, a4, t2 -; RV32ZBB-NEXT: sub a4, a6, a7 +; RV32ZBB-NEXT: sub t0, t1, t0 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a5, a5, a4 +; RV32ZBB-NEXT: sub a4, t0, a1 +; RV32ZBB-NEXT: sltu a7, a6, t3 +; RV32ZBB-NEXT: sub a1, a6, t3 +; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a4, a4, a7 ; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: .LBB22_11: ; RV32ZBB-NEXT: sw a2, 0(a0) diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 7c8638cb461e2..899c12a2e128d 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -541,75 +541,75 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw t0, 0(a1) +; RV32I-NEXT: lw t1, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw t0, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a2, a6 +; RV32I-NEXT: sltu a1, a2, a5 ; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t2, t0, a3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, a5, .LBB11_2 +; RV32I-NEXT: sub a7, a7, a1 +; RV32I-NEXT: sltu a1, t0, a3 +; RV32I-NEXT: mv t2, a1 +; RV32I-NEXT: beq t1, a6, .LBB11_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a5 +; RV32I-NEXT: sltu t2, t1, a6 ; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: sub t3, a2, a6 -; RV32I-NEXT: sltu a6, t3, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t3, t1 -; RV32I-NEXT: beq a6, a4, .LBB11_4 +; RV32I-NEXT: sub t3, a2, a5 +; RV32I-NEXT: sltu a5, t3, t2 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a7, t3, t2 +; RV32I-NEXT: beq a5, a4, .LBB11_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: sltu t2, a4, a5 ; RV32I-NEXT: j .LBB11_5 ; RV32I-NEXT: .LBB11_4: -; RV32I-NEXT: sltu t1, a2, a7 +; RV32I-NEXT: sltu t2, a2, a7 ; RV32I-NEXT: .LBB11_5: -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a6, a6, a1 ; RV32I-NEXT: sub a3, t0, a3 -; RV32I-NEXT: beq a5, a1, .LBB11_7 +; RV32I-NEXT: beq a6, t1, .LBB11_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, a5 +; RV32I-NEXT: sltu a1, t1, a6 ; RV32I-NEXT: j .LBB11_8 ; RV32I-NEXT: .LBB11_7: ; RV32I-NEXT: sltu a1, t0, a3 ; RV32I-NEXT: .LBB11_8: -; RV32I-NEXT: xor a4, a6, a4 +; RV32I-NEXT: xor a4, a5, a4 ; RV32I-NEXT: xor a2, a7, a2 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: beqz a2, .LBB11_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv a1, t2 ; RV32I-NEXT: .LBB11_10: ; RV32I-NEXT: neg t0, a1 -; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: xor a4, a3, t0 -; RV32I-NEXT: sltu a3, a2, t0 -; RV32I-NEXT: add a7, a6, a1 -; RV32I-NEXT: sltu a6, a4, t0 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: xor t1, a5, t0 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: beqz a5, .LBB11_12 -; RV32I-NEXT: # %bb.11: 
-; RV32I-NEXT: sltu a7, t1, t0 -; RV32I-NEXT: .LBB11_12: +; RV32I-NEXT: xor a4, a7, t0 +; RV32I-NEXT: xor a2, a5, t0 +; RV32I-NEXT: xor a5, a6, t0 +; RV32I-NEXT: xor a3, a3, t0 +; RV32I-NEXT: sltu a7, a4, t0 ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: add t1, t1, a1 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: sltu a4, a2, a7 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a5, t1, a6 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a7, a3, t0 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beqz a6, .LBB11_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: .LBB11_12: +; RV32I-NEXT: add a4, a4, a1 +; RV32I-NEXT: add a5, a5, a1 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: sltu a3, a4, t1 +; RV32I-NEXT: sub a4, a4, t1 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_ext_i128: @@ -637,75 +637,75 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-LABEL: abd_ext_i128: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw t0, 0(a1) +; RV32ZBB-NEXT: lw t1, 4(a1) ; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw a4, 12(a1) -; RV32ZBB-NEXT: lw t0, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sltu a1, a2, a5 ; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t2, t0, a3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB11_2 +; RV32ZBB-NEXT: sub a7, a7, a1 +; RV32ZBB-NEXT: sltu a1, t0, a3 +; RV32ZBB-NEXT: mv t2, a1 +; RV32ZBB-NEXT: beq t1, a6, .LBB11_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a5 +; RV32ZBB-NEXT: sltu t2, t1, a6 ; RV32ZBB-NEXT: .LBB11_2: -; RV32ZBB-NEXT: sub t3, a2, a6 -; RV32ZBB-NEXT: sltu a6, t3, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t3, t1 -; RV32ZBB-NEXT: beq a6, a4, .LBB11_4 +; RV32ZBB-NEXT: sub t3, a2, a5 +; RV32ZBB-NEXT: sltu a5, t3, t2 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a7, t3, t2 +; RV32ZBB-NEXT: beq a5, a4, .LBB11_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a4, a6 +; RV32ZBB-NEXT: sltu t2, a4, a5 ; RV32ZBB-NEXT: j .LBB11_5 ; RV32ZBB-NEXT: .LBB11_4: -; RV32ZBB-NEXT: sltu t1, a2, a7 +; RV32ZBB-NEXT: sltu t2, a2, a7 ; RV32ZBB-NEXT: .LBB11_5: -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a6, a6, a1 ; RV32ZBB-NEXT: sub a3, t0, a3 -; RV32ZBB-NEXT: beq a5, a1, .LBB11_7 +; RV32ZBB-NEXT: beq a6, t1, .LBB11_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, a5 +; RV32ZBB-NEXT: sltu a1, t1, a6 ; RV32ZBB-NEXT: j .LBB11_8 ; RV32ZBB-NEXT: .LBB11_7: ; RV32ZBB-NEXT: sltu a1, t0, a3 ; RV32ZBB-NEXT: .LBB11_8: -; RV32ZBB-NEXT: xor a4, a6, a4 +; RV32ZBB-NEXT: xor a4, a5, a4 ; RV32ZBB-NEXT: xor a2, a7, a2 ; RV32ZBB-NEXT: or a2, a2, a4 ; RV32ZBB-NEXT: beqz a2, .LBB11_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: mv a1, t2 ; RV32ZBB-NEXT: .LBB11_10: ; RV32ZBB-NEXT: neg t0, a1 -; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: xor a4, a3, t0 -; RV32ZBB-NEXT: sltu a3, a2, t0 -; RV32ZBB-NEXT: add a7, a6, a1 -; RV32ZBB-NEXT: sltu a6, a4, t0 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: xor t1, a5, t0 -; RV32ZBB-NEXT: mv 
a7, a6 -; RV32ZBB-NEXT: beqz a5, .LBB11_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a7, t1, t0 -; RV32ZBB-NEXT: .LBB11_12: +; RV32ZBB-NEXT: xor a4, a7, t0 +; RV32ZBB-NEXT: xor a2, a5, t0 +; RV32ZBB-NEXT: xor a5, a6, t0 +; RV32ZBB-NEXT: xor a3, a3, t0 +; RV32ZBB-NEXT: sltu a7, a4, t0 ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: add t1, t1, a1 -; RV32ZBB-NEXT: add a1, a4, a1 -; RV32ZBB-NEXT: sltu a4, a2, a7 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a5, t1, a6 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a7, a3, t0 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beqz a6, .LBB11_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a5, t0 +; RV32ZBB-NEXT: .LBB11_12: +; RV32ZBB-NEXT: add a4, a4, a1 +; RV32ZBB-NEXT: add a5, a5, a1 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: sltu a3, a4, t1 +; RV32ZBB-NEXT: sub a4, a4, t1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_ext_i128: @@ -741,75 +741,75 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_ext_i128_undef: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw t0, 0(a1) +; RV32I-NEXT: lw t1, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw t0, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a2, a6 +; RV32I-NEXT: sltu a1, a2, a5 ; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t2, t0, a3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, a5, .LBB12_2 +; RV32I-NEXT: sub a7, a7, a1 +; RV32I-NEXT: sltu a1, t0, a3 +; RV32I-NEXT: mv t2, a1 +; RV32I-NEXT: beq t1, a6, .LBB12_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a5 +; RV32I-NEXT: sltu t2, t1, a6 ; RV32I-NEXT: .LBB12_2: -; RV32I-NEXT: sub t3, a2, a6 -; RV32I-NEXT: sltu a6, t3, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t3, t1 -; RV32I-NEXT: beq a6, a4, .LBB12_4 +; RV32I-NEXT: sub t3, a2, a5 +; RV32I-NEXT: sltu a5, t3, t2 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a7, t3, t2 +; RV32I-NEXT: beq a5, a4, .LBB12_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: sltu t2, a4, a5 ; RV32I-NEXT: j .LBB12_5 ; RV32I-NEXT: .LBB12_4: -; RV32I-NEXT: sltu t1, a2, a7 +; RV32I-NEXT: sltu t2, a2, a7 ; RV32I-NEXT: .LBB12_5: -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a6, a6, a1 ; RV32I-NEXT: sub a3, t0, a3 -; RV32I-NEXT: beq a5, a1, .LBB12_7 +; RV32I-NEXT: beq a6, t1, .LBB12_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, a5 +; RV32I-NEXT: sltu a1, t1, a6 ; RV32I-NEXT: j .LBB12_8 ; RV32I-NEXT: .LBB12_7: ; RV32I-NEXT: sltu a1, t0, a3 ; RV32I-NEXT: .LBB12_8: -; RV32I-NEXT: xor a4, a6, a4 +; RV32I-NEXT: xor a4, a5, a4 ; RV32I-NEXT: xor a2, a7, a2 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: beqz a2, .LBB12_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv a1, t2 ; RV32I-NEXT: .LBB12_10: ; RV32I-NEXT: neg t0, a1 -; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: xor a4, a3, t0 -; RV32I-NEXT: sltu a3, a2, t0 -; RV32I-NEXT: add a7, a6, a1 -; RV32I-NEXT: sltu a6, a4, t0 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: xor t1, a5, t0 -; RV32I-NEXT: mv a7, a6 
-; RV32I-NEXT: beqz a5, .LBB12_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu a7, t1, t0 -; RV32I-NEXT: .LBB12_12: +; RV32I-NEXT: xor a4, a7, t0 +; RV32I-NEXT: xor a2, a5, t0 +; RV32I-NEXT: xor a5, a6, t0 +; RV32I-NEXT: xor a3, a3, t0 +; RV32I-NEXT: sltu a7, a4, t0 ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: add t1, t1, a1 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: sltu a4, a2, a7 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a5, t1, a6 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a7, a3, t0 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beqz a6, .LBB12_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: .LBB12_12: +; RV32I-NEXT: add a4, a4, a1 +; RV32I-NEXT: add a5, a5, a1 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: sltu a3, a4, t1 +; RV32I-NEXT: sub a4, a4, t1 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_ext_i128_undef: @@ -837,75 +837,75 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; RV32ZBB-LABEL: abd_ext_i128_undef: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw t0, 0(a1) +; RV32ZBB-NEXT: lw t1, 4(a1) ; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw a4, 12(a1) -; RV32ZBB-NEXT: lw t0, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sltu a1, a2, a5 ; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t2, t0, a3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB12_2 +; RV32ZBB-NEXT: sub a7, a7, a1 +; RV32ZBB-NEXT: sltu a1, t0, a3 +; RV32ZBB-NEXT: mv t2, a1 +; RV32ZBB-NEXT: beq t1, a6, .LBB12_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a5 +; RV32ZBB-NEXT: sltu t2, t1, a6 ; RV32ZBB-NEXT: .LBB12_2: -; RV32ZBB-NEXT: sub t3, a2, a6 -; RV32ZBB-NEXT: sltu a6, t3, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t3, t1 -; RV32ZBB-NEXT: beq a6, a4, .LBB12_4 +; RV32ZBB-NEXT: sub t3, a2, a5 +; RV32ZBB-NEXT: sltu a5, t3, t2 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a7, t3, t2 +; RV32ZBB-NEXT: beq a5, a4, .LBB12_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a4, a6 +; RV32ZBB-NEXT: sltu t2, a4, a5 ; RV32ZBB-NEXT: j .LBB12_5 ; RV32ZBB-NEXT: .LBB12_4: -; RV32ZBB-NEXT: sltu t1, a2, a7 +; RV32ZBB-NEXT: sltu t2, a2, a7 ; RV32ZBB-NEXT: .LBB12_5: -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a6, a6, a1 ; RV32ZBB-NEXT: sub a3, t0, a3 -; RV32ZBB-NEXT: beq a5, a1, .LBB12_7 +; RV32ZBB-NEXT: beq a6, t1, .LBB12_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, a5 +; RV32ZBB-NEXT: sltu a1, t1, a6 ; RV32ZBB-NEXT: j .LBB12_8 ; RV32ZBB-NEXT: .LBB12_7: ; RV32ZBB-NEXT: sltu a1, t0, a3 ; RV32ZBB-NEXT: .LBB12_8: -; RV32ZBB-NEXT: xor a4, a6, a4 +; RV32ZBB-NEXT: xor a4, a5, a4 ; RV32ZBB-NEXT: xor a2, a7, a2 ; RV32ZBB-NEXT: or a2, a2, a4 ; RV32ZBB-NEXT: beqz a2, .LBB12_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: mv a1, t2 ; RV32ZBB-NEXT: .LBB12_10: ; RV32ZBB-NEXT: neg t0, a1 -; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: xor a4, a3, t0 -; RV32ZBB-NEXT: sltu a3, a2, t0 -; RV32ZBB-NEXT: add a7, a6, a1 -; RV32ZBB-NEXT: sltu a6, a4, t0 -; 
RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: xor t1, a5, t0 -; RV32ZBB-NEXT: mv a7, a6 -; RV32ZBB-NEXT: beqz a5, .LBB12_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a7, t1, t0 -; RV32ZBB-NEXT: .LBB12_12: +; RV32ZBB-NEXT: xor a4, a7, t0 +; RV32ZBB-NEXT: xor a2, a5, t0 +; RV32ZBB-NEXT: xor a5, a6, t0 +; RV32ZBB-NEXT: xor a3, a3, t0 +; RV32ZBB-NEXT: sltu a7, a4, t0 ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: add t1, t1, a1 -; RV32ZBB-NEXT: add a1, a4, a1 -; RV32ZBB-NEXT: sltu a4, a2, a7 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a5, t1, a6 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a7, a3, t0 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beqz a6, .LBB12_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a5, t0 +; RV32ZBB-NEXT: .LBB12_12: +; RV32ZBB-NEXT: add a4, a4, a1 +; RV32ZBB-NEXT: add a5, a5, a1 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: sltu a3, a4, t1 +; RV32ZBB-NEXT: sub a4, a4, t1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_ext_i128_undef: @@ -1132,75 +1132,75 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_minmax_i128: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw t0, 0(a1) +; RV32I-NEXT: lw t1, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw t0, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a2, a6 +; RV32I-NEXT: sltu a1, a2, a5 ; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t2, t0, a3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, a5, .LBB17_2 +; RV32I-NEXT: sub a7, a7, a1 +; RV32I-NEXT: sltu a1, t0, a3 +; RV32I-NEXT: mv t2, a1 +; RV32I-NEXT: beq t1, a6, .LBB17_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a5 +; RV32I-NEXT: sltu t2, t1, a6 ; RV32I-NEXT: .LBB17_2: -; RV32I-NEXT: sub t3, a2, a6 -; RV32I-NEXT: sltu a6, t3, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t3, t1 -; RV32I-NEXT: beq a6, a4, .LBB17_4 +; RV32I-NEXT: sub t3, a2, a5 +; RV32I-NEXT: sltu a5, t3, t2 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a7, t3, t2 +; RV32I-NEXT: beq a5, a4, .LBB17_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: sltu t2, a4, a5 ; RV32I-NEXT: j .LBB17_5 ; RV32I-NEXT: .LBB17_4: -; RV32I-NEXT: sltu t1, a2, a7 +; RV32I-NEXT: sltu t2, a2, a7 ; RV32I-NEXT: .LBB17_5: -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a6, a6, a1 ; RV32I-NEXT: sub a3, t0, a3 -; RV32I-NEXT: beq a5, a1, .LBB17_7 +; RV32I-NEXT: beq a6, t1, .LBB17_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, a5 +; RV32I-NEXT: sltu a1, t1, a6 ; RV32I-NEXT: j .LBB17_8 ; RV32I-NEXT: .LBB17_7: ; RV32I-NEXT: sltu a1, t0, a3 ; RV32I-NEXT: .LBB17_8: -; RV32I-NEXT: xor a4, a6, a4 +; RV32I-NEXT: xor a4, a5, a4 ; RV32I-NEXT: xor a2, a7, a2 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: beqz a2, .LBB17_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv a1, t2 ; RV32I-NEXT: .LBB17_10: ; RV32I-NEXT: neg t0, a1 -; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: xor a4, a3, t0 -; RV32I-NEXT: sltu a3, a2, t0 -; RV32I-NEXT: add a7, a6, a1 -; RV32I-NEXT: sltu a6, a4, t0 -; 
RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: xor t1, a5, t0 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: beqz a5, .LBB17_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu a7, t1, t0 -; RV32I-NEXT: .LBB17_12: +; RV32I-NEXT: xor a4, a7, t0 +; RV32I-NEXT: xor a2, a5, t0 +; RV32I-NEXT: xor a5, a6, t0 +; RV32I-NEXT: xor a3, a3, t0 +; RV32I-NEXT: sltu a7, a4, t0 ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: add t1, t1, a1 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: sltu a4, a2, a7 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a5, t1, a6 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a7, a3, t0 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beqz a6, .LBB17_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: .LBB17_12: +; RV32I-NEXT: add a4, a4, a1 +; RV32I-NEXT: add a5, a5, a1 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: sltu a3, a4, t1 +; RV32I-NEXT: sub a4, a4, t1 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_minmax_i128: @@ -1228,75 +1228,75 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-LABEL: abd_minmax_i128: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw t0, 0(a1) +; RV32ZBB-NEXT: lw t1, 4(a1) ; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw a4, 12(a1) -; RV32ZBB-NEXT: lw t0, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sltu a1, a2, a5 ; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t2, t0, a3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB17_2 +; RV32ZBB-NEXT: sub a7, a7, a1 +; RV32ZBB-NEXT: sltu a1, t0, a3 +; RV32ZBB-NEXT: mv t2, a1 +; RV32ZBB-NEXT: beq t1, a6, .LBB17_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a5 +; RV32ZBB-NEXT: sltu t2, t1, a6 ; RV32ZBB-NEXT: .LBB17_2: -; RV32ZBB-NEXT: sub t3, a2, a6 -; RV32ZBB-NEXT: sltu a6, t3, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t3, t1 -; RV32ZBB-NEXT: beq a6, a4, .LBB17_4 +; RV32ZBB-NEXT: sub t3, a2, a5 +; RV32ZBB-NEXT: sltu a5, t3, t2 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a7, t3, t2 +; RV32ZBB-NEXT: beq a5, a4, .LBB17_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a4, a6 +; RV32ZBB-NEXT: sltu t2, a4, a5 ; RV32ZBB-NEXT: j .LBB17_5 ; RV32ZBB-NEXT: .LBB17_4: -; RV32ZBB-NEXT: sltu t1, a2, a7 +; RV32ZBB-NEXT: sltu t2, a2, a7 ; RV32ZBB-NEXT: .LBB17_5: -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a6, a6, a1 ; RV32ZBB-NEXT: sub a3, t0, a3 -; RV32ZBB-NEXT: beq a5, a1, .LBB17_7 +; RV32ZBB-NEXT: beq a6, t1, .LBB17_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, a5 +; RV32ZBB-NEXT: sltu a1, t1, a6 ; RV32ZBB-NEXT: j .LBB17_8 ; RV32ZBB-NEXT: .LBB17_7: ; RV32ZBB-NEXT: sltu a1, t0, a3 ; RV32ZBB-NEXT: .LBB17_8: -; RV32ZBB-NEXT: xor a4, a6, a4 +; RV32ZBB-NEXT: xor a4, a5, a4 ; RV32ZBB-NEXT: xor a2, a7, a2 ; RV32ZBB-NEXT: or a2, a2, a4 ; RV32ZBB-NEXT: beqz a2, .LBB17_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: mv a1, t2 ; RV32ZBB-NEXT: .LBB17_10: ; RV32ZBB-NEXT: neg t0, a1 -; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: xor a4, a3, t0 -; RV32ZBB-NEXT: sltu a3, a2, 
t0 -; RV32ZBB-NEXT: add a7, a6, a1 -; RV32ZBB-NEXT: sltu a6, a4, t0 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: xor t1, a5, t0 -; RV32ZBB-NEXT: mv a7, a6 -; RV32ZBB-NEXT: beqz a5, .LBB17_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a7, t1, t0 -; RV32ZBB-NEXT: .LBB17_12: +; RV32ZBB-NEXT: xor a4, a7, t0 +; RV32ZBB-NEXT: xor a2, a5, t0 +; RV32ZBB-NEXT: xor a5, a6, t0 +; RV32ZBB-NEXT: xor a3, a3, t0 +; RV32ZBB-NEXT: sltu a7, a4, t0 ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: add t1, t1, a1 -; RV32ZBB-NEXT: add a1, a4, a1 -; RV32ZBB-NEXT: sltu a4, a2, a7 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a5, t1, a6 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a7, a3, t0 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beqz a6, .LBB17_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a5, t0 +; RV32ZBB-NEXT: .LBB17_12: +; RV32ZBB-NEXT: add a4, a4, a1 +; RV32ZBB-NEXT: add a5, a5, a1 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: sltu a3, a4, t1 +; RV32ZBB-NEXT: sub a4, a4, t1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_minmax_i128: @@ -1525,75 +1525,75 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_cmp_i128: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw t0, 0(a1) +; RV32I-NEXT: lw t1, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw t0, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a2, a6 +; RV32I-NEXT: sltu a1, a2, a5 ; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t2, t0, a3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, a5, .LBB22_2 +; RV32I-NEXT: sub a7, a7, a1 +; RV32I-NEXT: sltu a1, t0, a3 +; RV32I-NEXT: mv t2, a1 +; RV32I-NEXT: beq t1, a6, .LBB22_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a5 +; RV32I-NEXT: sltu t2, t1, a6 ; RV32I-NEXT: .LBB22_2: -; RV32I-NEXT: sub t3, a2, a6 -; RV32I-NEXT: sltu a6, t3, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t3, t1 -; RV32I-NEXT: beq a6, a4, .LBB22_4 +; RV32I-NEXT: sub t3, a2, a5 +; RV32I-NEXT: sltu a5, t3, t2 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a7, t3, t2 +; RV32I-NEXT: beq a5, a4, .LBB22_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: sltu t2, a4, a5 ; RV32I-NEXT: j .LBB22_5 ; RV32I-NEXT: .LBB22_4: -; RV32I-NEXT: sltu t1, a2, a7 +; RV32I-NEXT: sltu t2, a2, a7 ; RV32I-NEXT: .LBB22_5: -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a6, a6, a1 ; RV32I-NEXT: sub a3, t0, a3 -; RV32I-NEXT: beq a5, a1, .LBB22_7 +; RV32I-NEXT: beq a6, t1, .LBB22_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, a5 +; RV32I-NEXT: sltu a1, t1, a6 ; RV32I-NEXT: j .LBB22_8 ; RV32I-NEXT: .LBB22_7: ; RV32I-NEXT: sltu a1, t0, a3 ; RV32I-NEXT: .LBB22_8: -; RV32I-NEXT: xor a4, a6, a4 +; RV32I-NEXT: xor a4, a5, a4 ; RV32I-NEXT: xor a2, a7, a2 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: beqz a2, .LBB22_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv a1, t2 ; RV32I-NEXT: .LBB22_10: ; RV32I-NEXT: neg t0, a1 -; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: xor a6, a6, t0 -; RV32I-NEXT: xor a4, a3, t0 -; RV32I-NEXT: sltu a3, a2, t0 
-; RV32I-NEXT: add a7, a6, a1 -; RV32I-NEXT: sltu a6, a4, t0 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: xor t1, a5, t0 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: beqz a5, .LBB22_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu a7, t1, t0 -; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: xor a4, a7, t0 +; RV32I-NEXT: xor a2, a5, t0 +; RV32I-NEXT: xor a5, a6, t0 +; RV32I-NEXT: xor a3, a3, t0 +; RV32I-NEXT: sltu a7, a4, t0 ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: add t1, t1, a1 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: sltu a4, a2, a7 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a5, t1, a6 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a7, a3, t0 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beqz a6, .LBB22_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: .LBB22_12: +; RV32I-NEXT: add a4, a4, a1 +; RV32I-NEXT: add a5, a5, a1 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: sltu a3, a4, t1 +; RV32I-NEXT: sub a4, a4, t1 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_cmp_i128: @@ -1621,75 +1621,75 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-LABEL: abd_cmp_i128: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw t0, 0(a1) +; RV32ZBB-NEXT: lw t1, 4(a1) ; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw a4, 12(a1) -; RV32ZBB-NEXT: lw t0, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sltu a1, a2, a5 ; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t2, t0, a3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB22_2 +; RV32ZBB-NEXT: sub a7, a7, a1 +; RV32ZBB-NEXT: sltu a1, t0, a3 +; RV32ZBB-NEXT: mv t2, a1 +; RV32ZBB-NEXT: beq t1, a6, .LBB22_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a5 +; RV32ZBB-NEXT: sltu t2, t1, a6 ; RV32ZBB-NEXT: .LBB22_2: -; RV32ZBB-NEXT: sub t3, a2, a6 -; RV32ZBB-NEXT: sltu a6, t3, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t3, t1 -; RV32ZBB-NEXT: beq a6, a4, .LBB22_4 +; RV32ZBB-NEXT: sub t3, a2, a5 +; RV32ZBB-NEXT: sltu a5, t3, t2 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a7, t3, t2 +; RV32ZBB-NEXT: beq a5, a4, .LBB22_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a4, a6 +; RV32ZBB-NEXT: sltu t2, a4, a5 ; RV32ZBB-NEXT: j .LBB22_5 ; RV32ZBB-NEXT: .LBB22_4: -; RV32ZBB-NEXT: sltu t1, a2, a7 +; RV32ZBB-NEXT: sltu t2, a2, a7 ; RV32ZBB-NEXT: .LBB22_5: -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a6, a6, a1 ; RV32ZBB-NEXT: sub a3, t0, a3 -; RV32ZBB-NEXT: beq a5, a1, .LBB22_7 +; RV32ZBB-NEXT: beq a6, t1, .LBB22_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, a5 +; RV32ZBB-NEXT: sltu a1, t1, a6 ; RV32ZBB-NEXT: j .LBB22_8 ; RV32ZBB-NEXT: .LBB22_7: ; RV32ZBB-NEXT: sltu a1, t0, a3 ; RV32ZBB-NEXT: .LBB22_8: -; RV32ZBB-NEXT: xor a4, a6, a4 +; RV32ZBB-NEXT: xor a4, a5, a4 ; RV32ZBB-NEXT: xor a2, a7, a2 ; RV32ZBB-NEXT: or a2, a2, a4 ; RV32ZBB-NEXT: beqz a2, .LBB22_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: mv a1, t2 ; RV32ZBB-NEXT: .LBB22_10: ; RV32ZBB-NEXT: neg t0, a1 -; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: xor a6, a6, t0 -; 
RV32ZBB-NEXT: xor a4, a3, t0 -; RV32ZBB-NEXT: sltu a3, a2, t0 -; RV32ZBB-NEXT: add a7, a6, a1 -; RV32ZBB-NEXT: sltu a6, a4, t0 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: xor t1, a5, t0 -; RV32ZBB-NEXT: mv a7, a6 -; RV32ZBB-NEXT: beqz a5, .LBB22_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a7, t1, t0 -; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: xor a4, a7, t0 +; RV32ZBB-NEXT: xor a2, a5, t0 +; RV32ZBB-NEXT: xor a5, a6, t0 +; RV32ZBB-NEXT: xor a3, a3, t0 +; RV32ZBB-NEXT: sltu a7, a4, t0 ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: add t1, t1, a1 -; RV32ZBB-NEXT: add a1, a4, a1 -; RV32ZBB-NEXT: sltu a4, a2, a7 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a5, t1, a6 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a7, a3, t0 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beqz a6, .LBB22_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a5, t0 +; RV32ZBB-NEXT: .LBB22_12: +; RV32ZBB-NEXT: add a4, a4, a1 +; RV32ZBB-NEXT: add a5, a5, a1 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: sltu a3, a4, t1 +; RV32ZBB-NEXT: sub a4, a4, t1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_cmp_i128: @@ -1919,75 +1919,75 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32I-LABEL: abd_select_i128: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a2) -; RV32I-NEXT: lw a5, 4(a2) -; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a5, 8(a2) ; RV32I-NEXT: lw a7, 12(a2) +; RV32I-NEXT: lw t0, 0(a1) +; RV32I-NEXT: lw t1, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) ; RV32I-NEXT: lw a4, 12(a1) -; RV32I-NEXT: lw t0, 0(a1) -; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: sltu t1, a2, a6 +; RV32I-NEXT: sltu a1, a2, a5 ; RV32I-NEXT: sub a7, a4, a7 -; RV32I-NEXT: sltu t2, t0, a3 -; RV32I-NEXT: sub a7, a7, t1 -; RV32I-NEXT: mv t1, t2 -; RV32I-NEXT: beq a1, a5, .LBB27_2 +; RV32I-NEXT: sub a7, a7, a1 +; RV32I-NEXT: sltu a1, t0, a3 +; RV32I-NEXT: mv t2, a1 +; RV32I-NEXT: beq t1, a6, .LBB27_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t1, a1, a5 +; RV32I-NEXT: sltu t2, t1, a6 ; RV32I-NEXT: .LBB27_2: -; RV32I-NEXT: sub t3, a2, a6 -; RV32I-NEXT: sltu a6, t3, t1 -; RV32I-NEXT: sub a6, a7, a6 -; RV32I-NEXT: sub a7, t3, t1 -; RV32I-NEXT: beq a6, a4, .LBB27_4 +; RV32I-NEXT: sub t3, a2, a5 +; RV32I-NEXT: sltu a5, t3, t2 +; RV32I-NEXT: sub a5, a7, a5 +; RV32I-NEXT: sub a7, t3, t2 +; RV32I-NEXT: beq a5, a4, .LBB27_4 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: sltu t2, a4, a5 ; RV32I-NEXT: j .LBB27_5 ; RV32I-NEXT: .LBB27_4: -; RV32I-NEXT: sltu t1, a2, a7 +; RV32I-NEXT: sltu t2, a2, a7 ; RV32I-NEXT: .LBB27_5: -; RV32I-NEXT: sub a5, a1, a5 -; RV32I-NEXT: sub a5, a5, t2 +; RV32I-NEXT: sub a6, t1, a6 +; RV32I-NEXT: sub a6, a6, a1 ; RV32I-NEXT: sub a3, t0, a3 -; RV32I-NEXT: beq a5, a1, .LBB27_7 +; RV32I-NEXT: beq a6, t1, .LBB27_7 ; RV32I-NEXT: # %bb.6: -; RV32I-NEXT: sltu a1, a1, a5 +; RV32I-NEXT: sltu a1, t1, a6 ; RV32I-NEXT: j .LBB27_8 ; RV32I-NEXT: .LBB27_7: ; RV32I-NEXT: sltu a1, t0, a3 ; RV32I-NEXT: .LBB27_8: -; RV32I-NEXT: xor a4, a6, a4 +; RV32I-NEXT: xor a4, a5, a4 ; RV32I-NEXT: xor a2, a7, a2 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: beqz a2, .LBB27_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: mv a1, t2 ; RV32I-NEXT: .LBB27_10: ; RV32I-NEXT: neg t0, a1 -; RV32I-NEXT: xor a2, a7, t0 -; RV32I-NEXT: xor a6, a6, t0 
-; RV32I-NEXT: xor a4, a3, t0 -; RV32I-NEXT: sltu a3, a2, t0 -; RV32I-NEXT: add a7, a6, a1 -; RV32I-NEXT: sltu a6, a4, t0 -; RV32I-NEXT: sub a3, a7, a3 -; RV32I-NEXT: xor t1, a5, t0 -; RV32I-NEXT: mv a7, a6 -; RV32I-NEXT: beqz a5, .LBB27_12 -; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: sltu a7, t1, t0 -; RV32I-NEXT: .LBB27_12: +; RV32I-NEXT: xor a4, a7, t0 +; RV32I-NEXT: xor a2, a5, t0 +; RV32I-NEXT: xor a5, a6, t0 +; RV32I-NEXT: xor a3, a3, t0 +; RV32I-NEXT: sltu a7, a4, t0 ; RV32I-NEXT: add a2, a2, a1 -; RV32I-NEXT: add t1, t1, a1 -; RV32I-NEXT: add a1, a4, a1 -; RV32I-NEXT: sltu a4, a2, a7 ; RV32I-NEXT: sub a2, a2, a7 -; RV32I-NEXT: sub a5, t1, a6 -; RV32I-NEXT: sub a3, a3, a4 +; RV32I-NEXT: sltu a7, a3, t0 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: beqz a6, .LBB27_12 +; RV32I-NEXT: # %bb.11: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: .LBB27_12: +; RV32I-NEXT: add a4, a4, a1 +; RV32I-NEXT: add a5, a5, a1 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: sltu a3, a4, t1 +; RV32I-NEXT: sub a4, a4, t1 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: sw a5, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a2, 12(a0) ; RV32I-NEXT: ret ; ; RV64I-LABEL: abd_select_i128: @@ -2015,75 +2015,75 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; RV32ZBB-LABEL: abd_select_i128: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: lw a3, 0(a2) -; RV32ZBB-NEXT: lw a5, 4(a2) -; RV32ZBB-NEXT: lw a6, 8(a2) +; RV32ZBB-NEXT: lw a6, 4(a2) +; RV32ZBB-NEXT: lw a5, 8(a2) ; RV32ZBB-NEXT: lw a7, 12(a2) +; RV32ZBB-NEXT: lw t0, 0(a1) +; RV32ZBB-NEXT: lw t1, 4(a1) ; RV32ZBB-NEXT: lw a2, 8(a1) ; RV32ZBB-NEXT: lw a4, 12(a1) -; RV32ZBB-NEXT: lw t0, 0(a1) -; RV32ZBB-NEXT: lw a1, 4(a1) -; RV32ZBB-NEXT: sltu t1, a2, a6 +; RV32ZBB-NEXT: sltu a1, a2, a5 ; RV32ZBB-NEXT: sub a7, a4, a7 -; RV32ZBB-NEXT: sltu t2, t0, a3 -; RV32ZBB-NEXT: sub a7, a7, t1 -; RV32ZBB-NEXT: mv t1, t2 -; RV32ZBB-NEXT: beq a1, a5, .LBB27_2 +; RV32ZBB-NEXT: sub a7, a7, a1 +; RV32ZBB-NEXT: sltu a1, t0, a3 +; RV32ZBB-NEXT: mv t2, a1 +; RV32ZBB-NEXT: beq t1, a6, .LBB27_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sltu t1, a1, a5 +; RV32ZBB-NEXT: sltu t2, t1, a6 ; RV32ZBB-NEXT: .LBB27_2: -; RV32ZBB-NEXT: sub t3, a2, a6 -; RV32ZBB-NEXT: sltu a6, t3, t1 -; RV32ZBB-NEXT: sub a6, a7, a6 -; RV32ZBB-NEXT: sub a7, t3, t1 -; RV32ZBB-NEXT: beq a6, a4, .LBB27_4 +; RV32ZBB-NEXT: sub t3, a2, a5 +; RV32ZBB-NEXT: sltu a5, t3, t2 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: sub a7, t3, t2 +; RV32ZBB-NEXT: beq a5, a4, .LBB27_4 ; RV32ZBB-NEXT: # %bb.3: -; RV32ZBB-NEXT: sltu t1, a4, a6 +; RV32ZBB-NEXT: sltu t2, a4, a5 ; RV32ZBB-NEXT: j .LBB27_5 ; RV32ZBB-NEXT: .LBB27_4: -; RV32ZBB-NEXT: sltu t1, a2, a7 +; RV32ZBB-NEXT: sltu t2, a2, a7 ; RV32ZBB-NEXT: .LBB27_5: -; RV32ZBB-NEXT: sub a5, a1, a5 -; RV32ZBB-NEXT: sub a5, a5, t2 +; RV32ZBB-NEXT: sub a6, t1, a6 +; RV32ZBB-NEXT: sub a6, a6, a1 ; RV32ZBB-NEXT: sub a3, t0, a3 -; RV32ZBB-NEXT: beq a5, a1, .LBB27_7 +; RV32ZBB-NEXT: beq a6, t1, .LBB27_7 ; RV32ZBB-NEXT: # %bb.6: -; RV32ZBB-NEXT: sltu a1, a1, a5 +; RV32ZBB-NEXT: sltu a1, t1, a6 ; RV32ZBB-NEXT: j .LBB27_8 ; RV32ZBB-NEXT: .LBB27_7: ; RV32ZBB-NEXT: sltu a1, t0, a3 ; RV32ZBB-NEXT: .LBB27_8: -; RV32ZBB-NEXT: xor a4, a6, a4 +; RV32ZBB-NEXT: xor a4, a5, a4 ; RV32ZBB-NEXT: xor a2, a7, a2 ; RV32ZBB-NEXT: or a2, a2, a4 ; RV32ZBB-NEXT: beqz a2, .LBB27_10 ; RV32ZBB-NEXT: # %bb.9: -; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: mv a1, t2 ; RV32ZBB-NEXT: .LBB27_10: ; RV32ZBB-NEXT: neg t0, a1 
-; RV32ZBB-NEXT: xor a2, a7, t0 -; RV32ZBB-NEXT: xor a6, a6, t0 -; RV32ZBB-NEXT: xor a4, a3, t0 -; RV32ZBB-NEXT: sltu a3, a2, t0 -; RV32ZBB-NEXT: add a7, a6, a1 -; RV32ZBB-NEXT: sltu a6, a4, t0 -; RV32ZBB-NEXT: sub a3, a7, a3 -; RV32ZBB-NEXT: xor t1, a5, t0 -; RV32ZBB-NEXT: mv a7, a6 -; RV32ZBB-NEXT: beqz a5, .LBB27_12 -; RV32ZBB-NEXT: # %bb.11: -; RV32ZBB-NEXT: sltu a7, t1, t0 -; RV32ZBB-NEXT: .LBB27_12: +; RV32ZBB-NEXT: xor a4, a7, t0 +; RV32ZBB-NEXT: xor a2, a5, t0 +; RV32ZBB-NEXT: xor a5, a6, t0 +; RV32ZBB-NEXT: xor a3, a3, t0 +; RV32ZBB-NEXT: sltu a7, a4, t0 ; RV32ZBB-NEXT: add a2, a2, a1 -; RV32ZBB-NEXT: add t1, t1, a1 -; RV32ZBB-NEXT: add a1, a4, a1 -; RV32ZBB-NEXT: sltu a4, a2, a7 ; RV32ZBB-NEXT: sub a2, a2, a7 -; RV32ZBB-NEXT: sub a5, t1, a6 -; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sltu a7, a3, t0 +; RV32ZBB-NEXT: mv t1, a7 +; RV32ZBB-NEXT: beqz a6, .LBB27_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a5, t0 +; RV32ZBB-NEXT: .LBB27_12: +; RV32ZBB-NEXT: add a4, a4, a1 +; RV32ZBB-NEXT: add a5, a5, a1 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: sltu a3, a4, t1 +; RV32ZBB-NEXT: sub a4, a4, t1 +; RV32ZBB-NEXT: sub a5, a5, a7 +; RV32ZBB-NEXT: sub a2, a2, a3 ; RV32ZBB-NEXT: sw a1, 0(a0) ; RV32ZBB-NEXT: sw a5, 4(a0) -; RV32ZBB-NEXT: sw a2, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: abd_select_i128: diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll index 5d4478f9d4b5f..533482e9fdeb4 100644 --- a/llvm/test/CodeGen/RISCV/add-before-shl.ll +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -200,26 +200,26 @@ define i128 @add_wide_operand(i128 %a) nounwind { ; ; RV32C-LABEL: add_wide_operand: ; RV32C: # %bb.0: -; RV32C-NEXT: c.lw a4, 12(a1) -; RV32C-NEXT: c.lw a3, 0(a1) +; RV32C-NEXT: lw a6, 0(a1) ; RV32C-NEXT: c.lw a2, 4(a1) -; RV32C-NEXT: c.lw a1, 8(a1) +; RV32C-NEXT: c.lw a4, 8(a1) +; RV32C-NEXT: c.lw a1, 12(a1) ; RV32C-NEXT: c.lui a5, 16 -; RV32C-NEXT: add a6, a4, a5 -; RV32C-NEXT: srli a5, a3, 29 -; RV32C-NEXT: slli a4, a2, 3 -; RV32C-NEXT: c.or a4, a5 -; RV32C-NEXT: srli a5, a1, 29 +; RV32C-NEXT: c.add a1, a5 +; RV32C-NEXT: srli a5, a6, 29 +; RV32C-NEXT: slli a3, a2, 3 +; RV32C-NEXT: c.or a3, a5 +; RV32C-NEXT: srli a5, a4, 29 ; RV32C-NEXT: c.srli a2, 29 -; RV32C-NEXT: c.slli a1, 3 -; RV32C-NEXT: c.slli a3, 3 +; RV32C-NEXT: c.slli a4, 3 ; RV32C-NEXT: c.slli a6, 3 -; RV32C-NEXT: c.or a1, a2 -; RV32C-NEXT: or a2, a6, a5 -; RV32C-NEXT: c.sw a3, 0(a0) -; RV32C-NEXT: c.sw a4, 4(a0) -; RV32C-NEXT: c.sw a1, 8(a0) -; RV32C-NEXT: c.sw a2, 12(a0) +; RV32C-NEXT: c.slli a1, 3 +; RV32C-NEXT: c.or a2, a4 +; RV32C-NEXT: c.or a1, a5 +; RV32C-NEXT: sw a6, 0(a0) +; RV32C-NEXT: c.sw a3, 4(a0) +; RV32C-NEXT: c.sw a2, 8(a0) +; RV32C-NEXT: c.sw a1, 12(a0) ; RV32C-NEXT: c.jr ra ; ; RV64C-LABEL: add_wide_operand: diff --git a/llvm/test/CodeGen/RISCV/add-imm.ll b/llvm/test/CodeGen/RISCV/add-imm.ll index 84deb4c00ac8d..21597beb0c483 100644 --- a/llvm/test/CodeGen/RISCV/add-imm.ll +++ b/llvm/test/CodeGen/RISCV/add-imm.ll @@ -214,28 +214,28 @@ define void @add32_reject() nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, %hi(ga) ; RV32I-NEXT: lui a1, %hi(gb) -; RV32I-NEXT: lw a2, %lo(ga)(a0) -; RV32I-NEXT: lw a3, %lo(gb)(a1) -; RV32I-NEXT: lui a4, 1 -; RV32I-NEXT: addi a4, a4, -1096 -; RV32I-NEXT: add a2, a2, a4 -; RV32I-NEXT: add a3, a3, a4 -; RV32I-NEXT: sw a2, %lo(ga)(a0) -; RV32I-NEXT: sw a3, %lo(gb)(a1) +; RV32I-NEXT: lui a2, 1 +; RV32I-NEXT: lw 
a3, %lo(ga)(a0) +; RV32I-NEXT: lw a4, %lo(gb)(a1) +; RV32I-NEXT: addi a2, a2, -1096 +; RV32I-NEXT: add a3, a3, a2 +; RV32I-NEXT: add a2, a4, a2 +; RV32I-NEXT: sw a3, %lo(ga)(a0) +; RV32I-NEXT: sw a2, %lo(gb)(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: add32_reject: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, %hi(ga) ; RV64I-NEXT: lui a1, %hi(gb) -; RV64I-NEXT: lw a2, %lo(ga)(a0) -; RV64I-NEXT: lw a3, %lo(gb)(a1) -; RV64I-NEXT: lui a4, 1 -; RV64I-NEXT: addi a4, a4, -1096 -; RV64I-NEXT: add a2, a2, a4 -; RV64I-NEXT: add a3, a3, a4 -; RV64I-NEXT: sw a2, %lo(ga)(a0) -; RV64I-NEXT: sw a3, %lo(gb)(a1) +; RV64I-NEXT: lui a2, 1 +; RV64I-NEXT: lw a3, %lo(ga)(a0) +; RV64I-NEXT: lw a4, %lo(gb)(a1) +; RV64I-NEXT: addi a2, a2, -1096 +; RV64I-NEXT: add a3, a3, a2 +; RV64I-NEXT: add a2, a4, a2 +; RV64I-NEXT: sw a3, %lo(ga)(a0) +; RV64I-NEXT: sw a2, %lo(gb)(a1) ; RV64I-NEXT: ret %1 = load i32, ptr @ga, align 4 %2 = load i32, ptr @gb, align 4 diff --git a/llvm/test/CodeGen/RISCV/alloca.ll b/llvm/test/CodeGen/RISCV/alloca.ll index 975fc93c830af..2463cd229ee7d 100644 --- a/llvm/test/CodeGen/RISCV/alloca.ll +++ b/llvm/test/CodeGen/RISCV/alloca.ll @@ -76,21 +76,21 @@ define void @alloca_callframe(i32 %n) nounwind { ; RV32I-NEXT: sub a0, sp, a0 ; RV32I-NEXT: mv sp, a0 ; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: li t0, 12 -; RV32I-NEXT: li t1, 11 -; RV32I-NEXT: li t2, 10 -; RV32I-NEXT: li t3, 9 +; RV32I-NEXT: li a7, 12 +; RV32I-NEXT: li t0, 11 +; RV32I-NEXT: li t1, 10 +; RV32I-NEXT: li t2, 9 ; RV32I-NEXT: li a1, 2 ; RV32I-NEXT: li a2, 3 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 6 ; RV32I-NEXT: li a6, 7 +; RV32I-NEXT: sw t2, 0(sp) +; RV32I-NEXT: sw t1, 4(sp) +; RV32I-NEXT: sw t0, 8(sp) +; RV32I-NEXT: sw a7, 12(sp) ; RV32I-NEXT: li a7, 8 -; RV32I-NEXT: sw t3, 0(sp) -; RV32I-NEXT: sw t2, 4(sp) -; RV32I-NEXT: sw t1, 8(sp) -; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: call func ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: addi sp, s0, -16 diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll index f032756e007b6..8d393e894e69d 100644 --- a/llvm/test/CodeGen/RISCV/alu64.ll +++ b/llvm/test/CodeGen/RISCV/alu64.ll @@ -206,8 +206,8 @@ define i64 @sll(i64 %a, i64 %b) nounwind { ; ; RV32I-LABEL: sll: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: sll a3, a0, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB11_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a1, a3 @@ -293,8 +293,8 @@ define i64 @srl(i64 %a, i64 %b) nounwind { ; ; RV32I-LABEL: srl: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: srl a3, a1, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB15_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a3 @@ -322,13 +322,12 @@ define i64 @sra(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: sra: ; RV32I: # %bb.0: ; RV32I-NEXT: mv a3, a1 -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: sra a1, a1, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB16_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a3, a3, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: srai a1, a3, 31 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB16_2: ; RV32I-NEXT: srl a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll index 8534ad379ebab..4abc125ce58eb 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll @@ -192,41 +192,41 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; 
RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a2 -; RV32-NEXT: mv s1, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a4, 4(a0) ; RV32-NEXT: j .LBB11_2 ; RV32-NEXT: .LBB11_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: sw a4, 8(sp) -; RV32-NEXT: sw a5, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a4, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lw a5, 12(sp) +; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a4, 12(sp) ; RV32-NEXT: bnez a0, .LBB11_6 ; RV32-NEXT: .LBB11_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s0, .LBB11_4 +; RV32-NEXT: beq a4, s0, .LBB11_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: slt a0, s0, a5 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: slt a0, s0, a4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: bnez a0, .LBB11_1 ; RV32-NEXT: j .LBB11_5 ; RV32-NEXT: .LBB11_4: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a4 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: sltu a0, s1, a1 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: bnez a0, .LBB11_1 ; RV32-NEXT: .LBB11_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32-NEXT: mv a2, s2 +; RV32-NEXT: mv a2, s1 ; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB11_1 ; RV32-NEXT: .LBB11_6: # %atomicrmw.end @@ -268,41 +268,41 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a2 -; RV32-NEXT: mv s1, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a4, 4(a0) ; RV32-NEXT: j .LBB13_2 ; RV32-NEXT: .LBB13_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sw a4, 8(sp) -; RV32-NEXT: sw a5, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a4, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lw a5, 12(sp) +; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a4, 12(sp) ; RV32-NEXT: bnez a0, .LBB13_6 ; RV32-NEXT: .LBB13_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s0, .LBB13_4 +; RV32-NEXT: beq a4, s0, .LBB13_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sltu a0, s0, a5 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: sltu a0, s0, a4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: bnez a0, .LBB13_1 ; RV32-NEXT: j .LBB13_5 ; RV32-NEXT: .LBB13_4: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a4 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: sltu a0, s1, a1 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: bnez a0, .LBB13_1 ; RV32-NEXT: .LBB13_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32-NEXT: mv a2, s2 +; RV32-NEXT: mv a2, s1 ; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB13_1 ; 
RV32-NEXT: .LBB13_6: # %atomicrmw.end @@ -344,41 +344,41 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a2 -; RV32-NEXT: mv s1, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a4, 4(a0) ; RV32-NEXT: j .LBB15_2 ; RV32-NEXT: .LBB15_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: sw a4, 8(sp) -; RV32-NEXT: sw a5, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a4, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lw a5, 12(sp) +; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a4, 12(sp) ; RV32-NEXT: bnez a0, .LBB15_6 ; RV32-NEXT: .LBB15_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s0, .LBB15_4 +; RV32-NEXT: beq a4, s0, .LBB15_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: slt a0, s0, a5 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: slt a0, s0, a4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: beqz a0, .LBB15_1 ; RV32-NEXT: j .LBB15_5 ; RV32-NEXT: .LBB15_4: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a4 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: sltu a0, s1, a1 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: beqz a0, .LBB15_1 ; RV32-NEXT: .LBB15_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB15_2 Depth=1 -; RV32-NEXT: mv a2, s2 +; RV32-NEXT: mv a2, s1 ; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB15_1 ; RV32-NEXT: .LBB15_6: # %atomicrmw.end @@ -420,41 +420,41 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a2 -; RV32-NEXT: mv s1, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: mv s2, a1 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: lw a1, 0(a0) +; RV32-NEXT: lw a4, 4(a0) ; RV32-NEXT: j .LBB17_2 ; RV32-NEXT: .LBB17_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sw a4, 8(sp) -; RV32-NEXT: sw a5, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a4, 12(sp) ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 8(sp) -; RV32-NEXT: lw a5, 12(sp) +; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a4, 12(sp) ; RV32-NEXT: bnez a0, .LBB17_6 ; RV32-NEXT: .LBB17_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: beq a5, s0, .LBB17_4 +; RV32-NEXT: beq a4, s0, .LBB17_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sltu a0, s0, a5 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: sltu a0, s0, a4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: beqz a0, .LBB17_1 ; RV32-NEXT: j .LBB17_5 ; RV32-NEXT: .LBB17_4: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: sltu a0, s2, a4 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: mv a3, a5 +; RV32-NEXT: sltu a0, s1, a1 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a3, a4 ; RV32-NEXT: beqz a0, .LBB17_1 ; RV32-NEXT: .LBB17_5: # 
%atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB17_2 Depth=1 -; RV32-NEXT: mv a2, s2 +; RV32-NEXT: mv a2, s1 ; RV32-NEXT: mv a3, s0 ; RV32-NEXT: j .LBB17_1 ; RV32-NEXT: .LBB17_6: # %atomicrmw.end diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index 81518541477a8..95cd49ff9611d 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -5352,34 +5352,34 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB45_2 ; RV32I-NEXT: .LBB45_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB45_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB45_4 ; RV32I-NEXT: .LBB45_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB45_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB45_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB45_1 ; RV32I-NEXT: .LBB45_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -5423,34 +5423,34 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB45_2 ; RV64I-NEXT: .LBB45_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB45_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB45_4 ; RV64I-NEXT: .LBB45_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB45_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB45_1 ; RV64I-NEXT: # %bb.3: # 
%atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB45_1 ; RV64I-NEXT: .LBB45_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -5537,34 +5537,34 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB46_2 ; RV32I-NEXT: .LBB46_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB46_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB46_4 ; RV32I-NEXT: .LBB46_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB46_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB46_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB46_1 ; RV32I-NEXT: .LBB46_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -5637,34 +5637,34 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB46_2 ; RV64I-NEXT: .LBB46_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB46_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB46_4 ; RV64I-NEXT: .LBB46_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB46_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB46_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV64I-NEXT: 
mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB46_1 ; RV64I-NEXT: .LBB46_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -5809,34 +5809,34 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB47_2 ; RV32I-NEXT: .LBB47_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB47_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB47_4 ; RV32I-NEXT: .LBB47_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB47_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB47_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB47_1 ; RV32I-NEXT: .LBB47_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -5909,34 +5909,34 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB47_2 ; RV64I-NEXT: .LBB47_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB47_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB47_4 ; RV64I-NEXT: .LBB47_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB47_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB47_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB47_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB47_1 ; RV64I-NEXT: 
.LBB47_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -6081,34 +6081,34 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB48_2 ; RV32I-NEXT: .LBB48_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB48_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB48_4 ; RV32I-NEXT: .LBB48_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB48_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB48_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB48_1 ; RV32I-NEXT: .LBB48_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -6181,34 +6181,34 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB48_2 ; RV64I-NEXT: .LBB48_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB48_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB48_4 ; RV64I-NEXT: .LBB48_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB48_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB48_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB48_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB48_1 ; RV64I-NEXT: .LBB48_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) 
# 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -6353,34 +6353,34 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB49_2 ; RV32I-NEXT: .LBB49_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB49_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB49_4 ; RV32I-NEXT: .LBB49_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB49_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB49_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB49_1 ; RV32I-NEXT: .LBB49_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -6424,34 +6424,34 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB49_2 ; RV64I-NEXT: .LBB49_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB49_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB49_4 ; RV64I-NEXT: .LBB49_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB49_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB49_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB49_1 ; RV64I-NEXT: .LBB49_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; 
RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -6538,34 +6538,34 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB50_2 ; RV32I-NEXT: .LBB50_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB50_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB50_4 ; RV32I-NEXT: .LBB50_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB50_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB50_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB50_1 ; RV32I-NEXT: .LBB50_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -6609,34 +6609,34 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB50_2 ; RV64I-NEXT: .LBB50_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB50_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB50_4 ; RV64I-NEXT: .LBB50_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB50_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB50_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB50_1 ; RV64I-NEXT: .LBB50_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -6723,34 +6723,34 @@ 
define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB51_2 ; RV32I-NEXT: .LBB51_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB51_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB51_4 ; RV32I-NEXT: .LBB51_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB51_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB51_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB51_1 ; RV32I-NEXT: .LBB51_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -6823,34 +6823,34 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB51_2 ; RV64I-NEXT: .LBB51_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB51_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB51_4 ; RV64I-NEXT: .LBB51_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB51_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB51_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB51_1 ; RV64I-NEXT: .LBB51_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -6995,34 +6995,34 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw 
s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB52_2 ; RV32I-NEXT: .LBB52_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB52_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB52_4 ; RV32I-NEXT: .LBB52_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB52_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB52_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB52_1 ; RV32I-NEXT: .LBB52_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -7095,34 +7095,34 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB52_2 ; RV64I-NEXT: .LBB52_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB52_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB52_4 ; RV64I-NEXT: .LBB52_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB52_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB52_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB52_1 ; RV64I-NEXT: .LBB52_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -7267,34 +7267,34 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded 
Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB53_2 ; RV32I-NEXT: .LBB53_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB53_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB53_4 ; RV32I-NEXT: .LBB53_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB53_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB53_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB53_1 ; RV32I-NEXT: .LBB53_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -7367,34 +7367,34 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB53_2 ; RV64I-NEXT: .LBB53_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB53_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB53_4 ; RV64I-NEXT: .LBB53_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB53_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB53_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB53_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB53_1 ; RV64I-NEXT: .LBB53_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -7539,34 +7539,34 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 
-; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 -; RV32I-NEXT: srai s2, a0, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: srai s2, a1, 24 ; RV32I-NEXT: j .LBB54_2 ; RV32I-NEXT: .LBB54_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB54_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB54_4 ; RV32I-NEXT: .LBB54_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB54_1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a1, a1, 24 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB54_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB54_1 ; RV32I-NEXT: .LBB54_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -7610,34 +7610,34 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 -; RV64I-NEXT: srai s2, a0, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 56 +; RV64I-NEXT: srai s2, a1, 56 ; RV64I-NEXT: j .LBB54_2 ; RV64I-NEXT: .LBB54_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB54_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB54_4 ; RV64I-NEXT: .LBB54_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 -; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB54_1 +; RV64I-NEXT: slli a1, a0, 56 +; RV64I-NEXT: srai a1, a1, 56 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB54_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB54_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB54_1 ; RV64I-NEXT: .LBB54_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -7724,32 +7724,32 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, 
a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB55_2 ; RV32I-NEXT: .LBB55_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB55_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB55_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB55_4 ; RV32I-NEXT: .LBB55_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s2, a0, .LBB55_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s2, a1, .LBB55_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB55_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB55_1 ; RV32I-NEXT: .LBB55_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -7788,32 +7788,32 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB55_2 ; RV64I-NEXT: .LBB55_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB55_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB55_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB55_4 ; RV64I-NEXT: .LBB55_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a0, .LBB55_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a1, .LBB55_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB55_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB55_1 ; RV64I-NEXT: .LBB55_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -7890,32 +7890,32 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB56_2 ; RV32I-NEXT: .LBB56_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB56_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: 
mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB56_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB56_4 ; RV32I-NEXT: .LBB56_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s2, a0, .LBB56_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s2, a1, .LBB56_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB56_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB56_1 ; RV32I-NEXT: .LBB56_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -7978,32 +7978,32 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB56_2 ; RV64I-NEXT: .LBB56_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB56_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB56_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB56_4 ; RV64I-NEXT: .LBB56_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a0, .LBB56_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a1, .LBB56_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB56_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB56_1 ; RV64I-NEXT: .LBB56_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -8128,32 +8128,32 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB57_2 ; RV32I-NEXT: .LBB57_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB57_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB57_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB57_4 ; RV32I-NEXT: .LBB57_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu 
s2, a0, .LBB57_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s2, a1, .LBB57_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB57_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB57_1 ; RV32I-NEXT: .LBB57_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -8216,32 +8216,32 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB57_2 ; RV64I-NEXT: .LBB57_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB57_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB57_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB57_4 ; RV64I-NEXT: .LBB57_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a0, .LBB57_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a1, .LBB57_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB57_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB57_1 ; RV64I-NEXT: .LBB57_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -8366,32 +8366,32 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB58_2 ; RV32I-NEXT: .LBB58_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB58_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB58_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB58_4 ; RV32I-NEXT: .LBB58_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s2, a0, .LBB58_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s2, a1, .LBB58_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB58_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB58_1 ; RV32I-NEXT: .LBB58_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte 
Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -8454,32 +8454,32 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB58_2 ; RV64I-NEXT: .LBB58_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB58_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB58_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB58_4 ; RV64I-NEXT: .LBB58_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a0, .LBB58_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a1, .LBB58_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB58_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB58_1 ; RV64I-NEXT: .LBB58_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -8604,32 +8604,32 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB59_2 ; RV32I-NEXT: .LBB59_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB59_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB59_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB59_4 ; RV32I-NEXT: .LBB59_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s2, a0, .LBB59_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s2, a1, .LBB59_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB59_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB59_1 ; RV32I-NEXT: .LBB59_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -8668,32 +8668,32 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; 
RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB59_2 ; RV64I-NEXT: .LBB59_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB59_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB59_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB59_4 ; RV64I-NEXT: .LBB59_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a0, .LBB59_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a1, .LBB59_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB59_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB59_1 ; RV64I-NEXT: .LBB59_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -8770,32 +8770,32 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB60_2 ; RV32I-NEXT: .LBB60_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB60_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB60_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB60_4 ; RV32I-NEXT: .LBB60_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s2, a0, .LBB60_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s2, a1, .LBB60_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB60_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB60_1 ; RV32I-NEXT: .LBB60_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -8834,32 +8834,32 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB60_2 ; RV64I-NEXT: .LBB60_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB60_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; 
RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB60_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB60_4 ; RV64I-NEXT: .LBB60_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a0, .LBB60_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a1, .LBB60_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB60_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB60_1 ; RV64I-NEXT: .LBB60_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -8936,32 +8936,32 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB61_2 ; RV32I-NEXT: .LBB61_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB61_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB61_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB61_4 ; RV32I-NEXT: .LBB61_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s2, a0, .LBB61_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s2, a1, .LBB61_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB61_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB61_1 ; RV32I-NEXT: .LBB61_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -9024,32 +9024,32 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB61_2 ; RV64I-NEXT: .LBB61_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB61_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB61_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB61_4 ; RV64I-NEXT: .LBB61_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: 
Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a0, .LBB61_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a1, .LBB61_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB61_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB61_1 ; RV64I-NEXT: .LBB61_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -9174,32 +9174,32 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB62_2 ; RV32I-NEXT: .LBB62_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB62_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB62_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB62_4 ; RV32I-NEXT: .LBB62_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s2, a0, .LBB62_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s2, a1, .LBB62_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB62_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB62_1 ; RV32I-NEXT: .LBB62_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -9262,32 +9262,32 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB62_2 ; RV64I-NEXT: .LBB62_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB62_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB62_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB62_4 ; RV64I-NEXT: .LBB62_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a0, .LBB62_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a1, .LBB62_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB62_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB62_1 ; RV64I-NEXT: 
.LBB62_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -9412,32 +9412,32 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB63_2 ; RV32I-NEXT: .LBB63_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB63_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB63_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB63_4 ; RV32I-NEXT: .LBB63_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s2, a0, .LBB63_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s2, a1, .LBB63_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB63_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB63_1 ; RV32I-NEXT: .LBB63_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -9500,32 +9500,32 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB63_2 ; RV64I-NEXT: .LBB63_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB63_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB63_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB63_4 ; RV64I-NEXT: .LBB63_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a0, .LBB63_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a1, .LBB63_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB63_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB63_1 ; RV64I-NEXT: .LBB63_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -9650,32 +9650,32 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte 
Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB64_2 ; RV32I-NEXT: .LBB64_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB64_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB64_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB64_4 ; RV32I-NEXT: .LBB64_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s2, a0, .LBB64_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s2, a1, .LBB64_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB64_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB64_1 ; RV32I-NEXT: .LBB64_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -9714,32 +9714,32 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB64_2 ; RV64I-NEXT: .LBB64_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB64_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB64_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB64_4 ; RV64I-NEXT: .LBB64_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a0, .LBB64_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a1, .LBB64_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB64_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB64_1 ; RV64I-NEXT: .LBB64_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -15381,34 +15381,34 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB110_2 ; 
RV32I-NEXT: .LBB110_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB110_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB110_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB110_4 ; RV32I-NEXT: .LBB110_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB110_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB110_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB110_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB110_1 ; RV32I-NEXT: .LBB110_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -15454,34 +15454,34 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB110_2 ; RV64I-NEXT: .LBB110_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB110_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB110_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB110_4 ; RV64I-NEXT: .LBB110_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB110_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB110_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB110_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB110_1 ; RV64I-NEXT: .LBB110_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -15572,34 +15572,34 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB111_2 ; RV32I-NEXT: .LBB111_1: # %atomicrmw.start ; 
RV32I-NEXT: # in Loop: Header=BB111_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB111_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB111_4 ; RV32I-NEXT: .LBB111_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB111_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB111_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB111_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB111_1 ; RV32I-NEXT: .LBB111_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -15676,34 +15676,34 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB111_2 ; RV64I-NEXT: .LBB111_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB111_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB111_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB111_4 ; RV64I-NEXT: .LBB111_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB111_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB111_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB111_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB111_1 ; RV64I-NEXT: .LBB111_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -15856,34 +15856,34 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB112_2 ; RV32I-NEXT: .LBB112_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB112_2 Depth=1 -; 
RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB112_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB112_4 ; RV32I-NEXT: .LBB112_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB112_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB112_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB112_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB112_1 ; RV32I-NEXT: .LBB112_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -15960,34 +15960,34 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB112_2 ; RV64I-NEXT: .LBB112_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB112_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB112_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB112_4 ; RV64I-NEXT: .LBB112_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB112_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB112_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB112_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB112_1 ; RV64I-NEXT: .LBB112_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -16140,34 +16140,34 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB113_2 ; RV32I-NEXT: .LBB113_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB113_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 
14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB113_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB113_4 ; RV32I-NEXT: .LBB113_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB113_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB113_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB113_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB113_1 ; RV32I-NEXT: .LBB113_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -16244,34 +16244,34 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB113_2 ; RV64I-NEXT: .LBB113_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB113_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB113_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB113_4 ; RV64I-NEXT: .LBB113_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB113_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB113_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB113_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB113_1 ; RV64I-NEXT: .LBB113_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -16424,34 +16424,34 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB114_2 ; RV32I-NEXT: .LBB114_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB114_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: 
li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB114_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB114_4 ; RV32I-NEXT: .LBB114_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s2, a0, .LBB114_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s2, a1, .LBB114_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB114_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB114_1 ; RV32I-NEXT: .LBB114_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -16497,34 +16497,34 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB114_2 ; RV64I-NEXT: .LBB114_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB114_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB114_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB114_4 ; RV64I-NEXT: .LBB114_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a0, .LBB114_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a1, .LBB114_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB114_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB114_1 ; RV64I-NEXT: .LBB114_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -16615,34 +16615,34 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB115_2 ; RV32I-NEXT: .LBB115_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB115_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: 
li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB115_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB115_4 ; RV32I-NEXT: .LBB115_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB115_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB115_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB115_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB115_1 ; RV32I-NEXT: .LBB115_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -16688,34 +16688,34 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB115_2 ; RV64I-NEXT: .LBB115_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB115_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB115_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB115_4 ; RV64I-NEXT: .LBB115_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB115_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB115_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB115_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB115_1 ; RV64I-NEXT: .LBB115_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -16806,34 +16806,34 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB116_2 ; RV32I-NEXT: .LBB116_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB116_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call 
__atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB116_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB116_4 ; RV32I-NEXT: .LBB116_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB116_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB116_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB116_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB116_1 ; RV32I-NEXT: .LBB116_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -16910,34 +16910,34 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB116_2 ; RV64I-NEXT: .LBB116_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB116_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB116_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB116_4 ; RV64I-NEXT: .LBB116_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB116_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB116_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB116_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB116_1 ; RV64I-NEXT: .LBB116_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -17090,34 +17090,34 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB117_2 ; RV32I-NEXT: .LBB117_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB117_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 
14(sp) -; RV32I-NEXT: bnez a0, .LBB117_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB117_4 ; RV32I-NEXT: .LBB117_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB117_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB117_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB117_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB117_1 ; RV32I-NEXT: .LBB117_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -17194,34 +17194,34 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB117_2 ; RV64I-NEXT: .LBB117_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB117_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB117_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB117_4 ; RV64I-NEXT: .LBB117_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB117_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB117_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB117_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB117_1 ; RV64I-NEXT: .LBB117_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -17374,34 +17374,34 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB118_2 ; RV32I-NEXT: .LBB118_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB118_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB118_4 +; 
RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: bnez a1, .LBB118_4 ; RV32I-NEXT: .LBB118_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB118_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB118_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB118_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB118_1 ; RV32I-NEXT: .LBB118_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -17478,34 +17478,34 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB118_2 ; RV64I-NEXT: .LBB118_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB118_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB118_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB118_4 ; RV64I-NEXT: .LBB118_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB118_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB118_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB118_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB118_1 ; RV64I-NEXT: .LBB118_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -17658,34 +17658,34 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 -; RV32I-NEXT: srai s2, a0, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a0, 0(a0) +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: srai s2, a1, 16 ; RV32I-NEXT: j .LBB119_2 ; RV32I-NEXT: .LBB119_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB119_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: bnez a0, .LBB119_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) 
+; RV32I-NEXT: bnez a1, .LBB119_4 ; RV32I-NEXT: .LBB119_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s2, a0, .LBB119_1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a1, a1, 16 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s2, a1, .LBB119_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB119_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB119_1 ; RV32I-NEXT: .LBB119_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -17731,34 +17731,34 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 -; RV64I-NEXT: srai s2, a0, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a0, 0(a0) +; RV64I-NEXT: slli a1, a1, 48 +; RV64I-NEXT: srai s2, a1, 48 ; RV64I-NEXT: j .LBB119_2 ; RV64I-NEXT: .LBB119_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB119_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: bnez a0, .LBB119_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: bnez a1, .LBB119_4 ; RV64I-NEXT: .LBB119_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a0, .LBB119_1 +; RV64I-NEXT: slli a1, a0, 48 +; RV64I-NEXT: srai a1, a1, 48 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a1, .LBB119_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB119_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB119_1 ; RV64I-NEXT: .LBB119_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -17852,32 +17852,32 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB120_2 ; RV32I-NEXT: .LBB120_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB120_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB120_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB120_4 ; RV32I-NEXT: .LBB120_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s3, a0, .LBB120_1 +; RV32I-NEXT: 
and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s3, a1, .LBB120_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB120_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB120_1 ; RV32I-NEXT: .LBB120_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -17921,32 +17921,32 @@ define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB120_2 ; RV64I-NEXT: .LBB120_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB120_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB120_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB120_4 ; RV64I-NEXT: .LBB120_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu s3, a0, .LBB120_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s3, a1, .LBB120_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB120_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB120_1 ; RV64I-NEXT: .LBB120_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -18029,32 +18029,32 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB121_2 ; RV32I-NEXT: .LBB121_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB121_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB121_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB121_4 ; RV32I-NEXT: .LBB121_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s3, a0, .LBB121_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s3, a1, .LBB121_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB121_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB121_1 ; RV32I-NEXT: .LBB121_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -18123,32 +18123,32 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 
; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB121_2 ; RV64I-NEXT: .LBB121_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB121_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB121_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB121_4 ; RV64I-NEXT: .LBB121_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu s3, a0, .LBB121_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s3, a1, .LBB121_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB121_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB121_1 ; RV64I-NEXT: .LBB121_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -18281,32 +18281,32 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB122_2 ; RV32I-NEXT: .LBB122_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB122_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 3 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB122_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB122_4 ; RV32I-NEXT: .LBB122_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s3, a0, .LBB122_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s3, a1, .LBB122_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB122_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB122_1 ; RV32I-NEXT: .LBB122_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -18375,32 +18375,32 @@ define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB122_2 ; RV64I-NEXT: .LBB122_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB122_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 3 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB122_4 +; RV64I-NEXT: mv a1, a0 
+; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB122_4 ; RV64I-NEXT: .LBB122_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu s3, a0, .LBB122_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s3, a1, .LBB122_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB122_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB122_1 ; RV64I-NEXT: .LBB122_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -18533,32 +18533,32 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB123_2 ; RV32I-NEXT: .LBB123_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB123_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB123_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB123_4 ; RV32I-NEXT: .LBB123_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s3, a0, .LBB123_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s3, a1, .LBB123_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB123_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB123_1 ; RV32I-NEXT: .LBB123_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -18627,32 +18627,32 @@ define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB123_2 ; RV64I-NEXT: .LBB123_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB123_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB123_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB123_4 ; RV64I-NEXT: .LBB123_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu s3, a0, .LBB123_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s3, a1, .LBB123_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB123_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB123_1 ; RV64I-NEXT: .LBB123_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; 
RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -18785,32 +18785,32 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB124_2 ; RV32I-NEXT: .LBB124_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB124_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB124_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB124_4 ; RV32I-NEXT: .LBB124_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s3, a0, .LBB124_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s3, a1, .LBB124_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB124_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB124_1 ; RV32I-NEXT: .LBB124_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -18854,32 +18854,32 @@ define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB124_2 ; RV64I-NEXT: .LBB124_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB124_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB124_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB124_4 ; RV64I-NEXT: .LBB124_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu s3, a0, .LBB124_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s3, a1, .LBB124_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB124_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB124_1 ; RV64I-NEXT: .LBB124_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -18962,32 +18962,32 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB125_2 ; RV32I-NEXT: .LBB125_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB125_2 Depth=1 -; RV32I-NEXT: sh a1, 
10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB125_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB125_4 ; RV32I-NEXT: .LBB125_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s3, a0, .LBB125_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s3, a1, .LBB125_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB125_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB125_1 ; RV32I-NEXT: .LBB125_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19031,32 +19031,32 @@ define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB125_2 ; RV64I-NEXT: .LBB125_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB125_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB125_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB125_4 ; RV64I-NEXT: .LBB125_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bgeu s3, a0, .LBB125_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s3, a1, .LBB125_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB125_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB125_1 ; RV64I-NEXT: .LBB125_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -19139,32 +19139,32 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB126_2 ; RV32I-NEXT: .LBB126_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB126_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB126_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB126_4 ; RV32I-NEXT: .LBB126_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s3, a0, .LBB126_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s3, a1, 
.LBB126_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB126_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB126_1 ; RV32I-NEXT: .LBB126_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19233,32 +19233,32 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB126_2 ; RV64I-NEXT: .LBB126_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB126_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB126_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB126_4 ; RV64I-NEXT: .LBB126_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bgeu s3, a0, .LBB126_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s3, a1, .LBB126_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB126_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB126_1 ; RV64I-NEXT: .LBB126_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -19391,32 +19391,32 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB127_2 ; RV32I-NEXT: .LBB127_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB127_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 3 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB127_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB127_4 ; RV32I-NEXT: .LBB127_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s3, a0, .LBB127_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s3, a1, .LBB127_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB127_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB127_1 ; RV32I-NEXT: .LBB127_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19485,32 +19485,32 @@ define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: 
lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB127_2 ; RV64I-NEXT: .LBB127_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB127_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 3 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB127_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB127_4 ; RV64I-NEXT: .LBB127_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bgeu s3, a0, .LBB127_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s3, a1, .LBB127_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB127_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB127_1 ; RV64I-NEXT: .LBB127_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -19643,32 +19643,32 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB128_2 ; RV32I-NEXT: .LBB128_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB128_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB128_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB128_4 ; RV32I-NEXT: .LBB128_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s3, a0, .LBB128_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s3, a1, .LBB128_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB128_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB128_1 ; RV32I-NEXT: .LBB128_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19737,32 +19737,32 @@ define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB128_2 ; RV64I-NEXT: .LBB128_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB128_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB128_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB128_4 ; 
RV64I-NEXT: .LBB128_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bgeu s3, a0, .LBB128_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s3, a1, .LBB128_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB128_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB128_1 ; RV64I-NEXT: .LBB128_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -19895,32 +19895,32 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB129_2 ; RV32I-NEXT: .LBB129_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB129_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB129_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB129_4 ; RV32I-NEXT: .LBB129_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s3, a0, .LBB129_1 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s3, a1, .LBB129_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB129_2 Depth=1 ; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB129_1 ; RV32I-NEXT: .LBB129_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -19964,32 +19964,32 @@ define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB129_2 ; RV64I-NEXT: .LBB129_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB129_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB129_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB129_4 ; RV64I-NEXT: .LBB129_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bgeu s3, a0, .LBB129_1 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s3, a1, .LBB129_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB129_2 Depth=1 ; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB129_1 ; RV64I-NEXT: .LBB129_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld 
s1, 24(sp) # 8-byte Folded Reload @@ -22176,30 +22176,30 @@ define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB165_2 ; RV32I-NEXT: .LBB165_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB165_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB165_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB165_4 ; RV32I-NEXT: .LBB165_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s1, a3, .LBB165_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s0, a0, .LBB165_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB165_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB165_1 ; RV32I-NEXT: .LBB165_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22218,31 +22218,31 @@ define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB165_2 ; RV64I-NEXT: .LBB165_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB165_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB165_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB165_4 ; RV64I-NEXT: .LBB165_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a3, .LBB165_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a0, .LBB165_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB165_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB165_1 ; RV64I-NEXT: .LBB165_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22265,30 +22265,30 @@ define i32 @atomicrmw_max_i32_acquire(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB166_2 ; RV32I-NEXT: .LBB166_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: 
Header=BB166_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB166_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB166_4 ; RV32I-NEXT: .LBB166_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s1, a3, .LBB166_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s0, a0, .LBB166_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB166_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB166_1 ; RV32I-NEXT: .LBB166_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22312,31 +22312,31 @@ define i32 @atomicrmw_max_i32_acquire(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB166_2 ; RV64I-NEXT: .LBB166_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB166_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB166_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB166_4 ; RV64I-NEXT: .LBB166_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a3, .LBB166_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a0, .LBB166_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB166_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB166_1 ; RV64I-NEXT: .LBB166_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22364,30 +22364,30 @@ define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB167_2 ; RV32I-NEXT: .LBB167_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB167_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB167_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB167_4 ; RV32I-NEXT: .LBB167_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s1, 
a3, .LBB167_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s0, a0, .LBB167_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB167_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB167_1 ; RV32I-NEXT: .LBB167_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22411,31 +22411,31 @@ define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB167_2 ; RV64I-NEXT: .LBB167_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB167_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB167_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB167_4 ; RV64I-NEXT: .LBB167_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a3, .LBB167_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a0, .LBB167_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB167_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB167_1 ; RV64I-NEXT: .LBB167_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22463,30 +22463,30 @@ define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB168_2 ; RV32I-NEXT: .LBB168_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB168_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB168_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB168_4 ; RV32I-NEXT: .LBB168_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s1, a3, .LBB168_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s0, a0, .LBB168_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB168_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB168_1 ; RV32I-NEXT: .LBB168_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22510,31 +22510,31 @@ define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 
%b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB168_2 ; RV64I-NEXT: .LBB168_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB168_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB168_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB168_4 ; RV64I-NEXT: .LBB168_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a3, .LBB168_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a0, .LBB168_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB168_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB168_1 ; RV64I-NEXT: .LBB168_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22562,30 +22562,30 @@ define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB169_2 ; RV32I-NEXT: .LBB169_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB169_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB169_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB169_4 ; RV32I-NEXT: .LBB169_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s1, a3, .LBB169_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s0, a0, .LBB169_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB169_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB169_1 ; RV32I-NEXT: .LBB169_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22609,31 +22609,31 @@ define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB169_2 ; RV64I-NEXT: .LBB169_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB169_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: 
addi a1, sp, 12 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB169_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB169_4 ; RV64I-NEXT: .LBB169_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a3, .LBB169_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a0, .LBB169_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB169_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB169_1 ; RV64I-NEXT: .LBB169_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22661,30 +22661,30 @@ define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB170_2 ; RV32I-NEXT: .LBB170_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB170_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB170_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB170_4 ; RV32I-NEXT: .LBB170_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s1, a3, .LBB170_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s0, a0, .LBB170_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB170_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB170_1 ; RV32I-NEXT: .LBB170_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22703,31 +22703,31 @@ define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB170_2 ; RV64I-NEXT: .LBB170_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB170_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB170_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB170_4 ; RV64I-NEXT: .LBB170_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a3, .LBB170_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a0, .LBB170_1 ; 
RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB170_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB170_1 ; RV64I-NEXT: .LBB170_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22750,30 +22750,30 @@ define i32 @atomicrmw_min_i32_acquire(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB171_2 ; RV32I-NEXT: .LBB171_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB171_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB171_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB171_4 ; RV32I-NEXT: .LBB171_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s1, a3, .LBB171_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s0, a0, .LBB171_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB171_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB171_1 ; RV32I-NEXT: .LBB171_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22797,31 +22797,31 @@ define i32 @atomicrmw_min_i32_acquire(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB171_2 ; RV64I-NEXT: .LBB171_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB171_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB171_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB171_4 ; RV64I-NEXT: .LBB171_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a3, .LBB171_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a0, .LBB171_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB171_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB171_1 ; RV64I-NEXT: .LBB171_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22849,30 +22849,30 @@ define i32 @atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: 
sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB172_2 ; RV32I-NEXT: .LBB172_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB172_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB172_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB172_4 ; RV32I-NEXT: .LBB172_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s1, a3, .LBB172_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s0, a0, .LBB172_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB172_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB172_1 ; RV32I-NEXT: .LBB172_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22896,31 +22896,31 @@ define i32 @atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB172_2 ; RV64I-NEXT: .LBB172_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB172_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB172_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB172_4 ; RV64I-NEXT: .LBB172_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a3, .LBB172_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a0, .LBB172_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB172_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB172_1 ; RV64I-NEXT: .LBB172_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -22948,30 +22948,30 @@ define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB173_2 ; RV32I-NEXT: .LBB173_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB173_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; 
RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB173_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB173_4 ; RV32I-NEXT: .LBB173_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s1, a3, .LBB173_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s0, a0, .LBB173_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB173_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB173_1 ; RV32I-NEXT: .LBB173_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -22995,31 +22995,31 @@ define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB173_2 ; RV64I-NEXT: .LBB173_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB173_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB173_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB173_4 ; RV64I-NEXT: .LBB173_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a3, .LBB173_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a0, .LBB173_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB173_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB173_1 ; RV64I-NEXT: .LBB173_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23047,30 +23047,30 @@ define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB174_2 ; RV32I-NEXT: .LBB174_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB174_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB174_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB174_4 ; RV32I-NEXT: .LBB174_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s1, a3, .LBB174_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s0, a0, .LBB174_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB174_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; 
RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB174_1 ; RV32I-NEXT: .LBB174_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23094,31 +23094,31 @@ define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB174_2 ; RV64I-NEXT: .LBB174_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB174_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB174_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB174_4 ; RV64I-NEXT: .LBB174_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a3, .LBB174_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a0, .LBB174_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB174_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB174_1 ; RV64I-NEXT: .LBB174_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23146,30 +23146,30 @@ define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB175_2 ; RV32I-NEXT: .LBB175_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB175_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB175_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB175_4 ; RV32I-NEXT: .LBB175_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s1, a3, .LBB175_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s0, a0, .LBB175_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB175_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB175_1 ; RV32I-NEXT: .LBB175_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23188,31 +23188,31 @@ define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, 
a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB175_2 ; RV64I-NEXT: .LBB175_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB175_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB175_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB175_4 ; RV64I-NEXT: .LBB175_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a3, .LBB175_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a0, .LBB175_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB175_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB175_1 ; RV64I-NEXT: .LBB175_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23235,30 +23235,30 @@ define i32 @atomicrmw_umax_i32_acquire(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB176_2 ; RV32I-NEXT: .LBB176_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB176_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB176_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB176_4 ; RV32I-NEXT: .LBB176_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s1, a3, .LBB176_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s0, a0, .LBB176_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB176_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB176_1 ; RV32I-NEXT: .LBB176_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23282,31 +23282,31 @@ define i32 @atomicrmw_umax_i32_acquire(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB176_2 ; RV64I-NEXT: .LBB176_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB176_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 
12(sp) -; RV64I-NEXT: bnez a0, .LBB176_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB176_4 ; RV64I-NEXT: .LBB176_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a3, .LBB176_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a0, .LBB176_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB176_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB176_1 ; RV64I-NEXT: .LBB176_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23334,30 +23334,30 @@ define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB177_2 ; RV32I-NEXT: .LBB177_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB177_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB177_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB177_4 ; RV32I-NEXT: .LBB177_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s1, a3, .LBB177_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s0, a0, .LBB177_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB177_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB177_1 ; RV32I-NEXT: .LBB177_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23381,31 +23381,31 @@ define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB177_2 ; RV64I-NEXT: .LBB177_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB177_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB177_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB177_4 ; RV64I-NEXT: .LBB177_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a3, .LBB177_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a0, .LBB177_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB177_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB177_1 ; RV64I-NEXT: 
.LBB177_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23433,30 +23433,30 @@ define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB178_2 ; RV32I-NEXT: .LBB178_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB178_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB178_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB178_4 ; RV32I-NEXT: .LBB178_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s1, a3, .LBB178_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s0, a0, .LBB178_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB178_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB178_1 ; RV32I-NEXT: .LBB178_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23480,31 +23480,31 @@ define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB178_2 ; RV64I-NEXT: .LBB178_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB178_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB178_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB178_4 ; RV64I-NEXT: .LBB178_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a3, .LBB178_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a0, .LBB178_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB178_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB178_1 ; RV64I-NEXT: .LBB178_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23532,30 +23532,30 @@ define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; 
RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB179_2 ; RV32I-NEXT: .LBB179_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB179_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB179_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB179_4 ; RV32I-NEXT: .LBB179_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s1, a3, .LBB179_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s0, a0, .LBB179_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB179_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB179_1 ; RV32I-NEXT: .LBB179_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23579,31 +23579,31 @@ define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB179_2 ; RV64I-NEXT: .LBB179_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB179_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB179_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB179_4 ; RV64I-NEXT: .LBB179_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a3, .LBB179_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a0, .LBB179_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB179_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB179_1 ; RV64I-NEXT: .LBB179_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23631,30 +23631,30 @@ define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB180_2 ; RV32I-NEXT: .LBB180_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB180_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB180_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) 
+; RV32I-NEXT: bnez a1, .LBB180_4 ; RV32I-NEXT: .LBB180_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s1, a3, .LBB180_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s0, a0, .LBB180_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB180_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB180_1 ; RV32I-NEXT: .LBB180_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23673,31 +23673,31 @@ define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB180_2 ; RV64I-NEXT: .LBB180_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB180_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB180_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB180_4 ; RV64I-NEXT: .LBB180_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a3, .LBB180_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a0, .LBB180_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB180_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB180_1 ; RV64I-NEXT: .LBB180_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23720,30 +23720,30 @@ define i32 @atomicrmw_umin_i32_acquire(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB181_2 ; RV32I-NEXT: .LBB181_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB181_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 2 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB181_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB181_4 ; RV32I-NEXT: .LBB181_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s1, a3, .LBB181_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s0, a0, .LBB181_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB181_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB181_1 ; RV32I-NEXT: .LBB181_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded 
Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23767,31 +23767,31 @@ define i32 @atomicrmw_umin_i32_acquire(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB181_2 ; RV64I-NEXT: .LBB181_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB181_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB181_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB181_4 ; RV64I-NEXT: .LBB181_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a3, .LBB181_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a0, .LBB181_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB181_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB181_1 ; RV64I-NEXT: .LBB181_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23819,30 +23819,30 @@ define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB182_2 ; RV32I-NEXT: .LBB182_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB182_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 3 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB182_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB182_4 ; RV32I-NEXT: .LBB182_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s1, a3, .LBB182_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s0, a0, .LBB182_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB182_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB182_1 ; RV32I-NEXT: .LBB182_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23866,31 +23866,31 @@ define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 
; RV64I-NEXT: j .LBB182_2 ; RV64I-NEXT: .LBB182_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB182_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB182_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB182_4 ; RV64I-NEXT: .LBB182_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a3, .LBB182_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a0, .LBB182_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB182_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB182_1 ; RV64I-NEXT: .LBB182_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -23918,30 +23918,30 @@ define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB183_2 ; RV32I-NEXT: .LBB183_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB183_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 4 ; RV32I-NEXT: li a4, 2 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB183_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB183_4 ; RV32I-NEXT: .LBB183_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s1, a3, .LBB183_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s0, a0, .LBB183_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB183_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB183_1 ; RV32I-NEXT: .LBB183_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -23965,31 +23965,31 @@ define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB183_2 ; RV64I-NEXT: .LBB183_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB183_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB183_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB183_4 ; RV64I-NEXT: .LBB183_2: # 
%atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a3, .LBB183_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a0, .LBB183_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB183_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB183_1 ; RV64I-NEXT: .LBB183_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -24017,30 +24017,30 @@ define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB184_2 ; RV32I-NEXT: .LBB184_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB184_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB184_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB184_4 ; RV32I-NEXT: .LBB184_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s1, a3, .LBB184_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s0, a0, .LBB184_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB184_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB184_1 ; RV32I-NEXT: .LBB184_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -24064,31 +24064,31 @@ define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB184_2 ; RV64I-NEXT: .LBB184_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB184_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB184_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB184_4 ; RV64I-NEXT: .LBB184_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a3, .LBB184_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a0, .LBB184_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB184_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB184_1 ; RV64I-NEXT: .LBB184_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; 
RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -26073,45 +26073,44 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB220_2 ; RV32I-NEXT: .LBB220_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB220_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB220_7 ; RV32I-NEXT: .LBB220_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB220_4 +; RV32I-NEXT: beq a1, s0, .LBB220_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB220_5 ; RV32I-NEXT: .LBB220_4: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB220_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB220_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB220_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB220_1 ; RV32I-NEXT: .LBB220_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26127,45 +26126,44 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB220_2 ; RV32IA-NEXT: .LBB220_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB220_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB220_7 ; RV32IA-NEXT: .LBB220_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB220_4 +; RV32IA-NEXT: beq a1, s0, .LBB220_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in 
Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB220_5 ; RV32IA-NEXT: .LBB220_4: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB220_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB220_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB220_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB220_1 ; RV32IA-NEXT: .LBB220_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26179,30 +26177,30 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB220_2 ; RV64I-NEXT: .LBB220_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB220_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB220_4 ; RV64I-NEXT: .LBB220_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s1, a3, .LBB220_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s0, a0, .LBB220_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB220_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB220_1 ; RV64I-NEXT: .LBB220_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -26226,45 +26224,44 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB221_2 ; RV32I-NEXT: .LBB221_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB221_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB221_7 ; RV32I-NEXT: .LBB221_2: # %atomicrmw.start ; RV32I-NEXT: # =>This 
Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB221_4 +; RV32I-NEXT: beq a1, s0, .LBB221_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB221_5 ; RV32I-NEXT: .LBB221_4: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB221_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB221_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB221_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB221_1 ; RV32I-NEXT: .LBB221_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26280,45 +26277,44 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB221_2 ; RV32IA-NEXT: .LBB221_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB221_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB221_7 ; RV32IA-NEXT: .LBB221_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB221_4 +; RV32IA-NEXT: beq a1, s0, .LBB221_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB221_5 ; RV32IA-NEXT: .LBB221_4: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB221_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB221_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB221_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB221_1 ; RV32IA-NEXT: .LBB221_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26332,30 +26328,30 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; 
RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB221_2 ; RV64I-NEXT: .LBB221_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB221_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB221_4 ; RV64I-NEXT: .LBB221_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s1, a3, .LBB221_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s0, a0, .LBB221_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB221_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB221_1 ; RV64I-NEXT: .LBB221_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -26384,45 +26380,44 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB222_2 ; RV32I-NEXT: .LBB222_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB222_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB222_7 ; RV32I-NEXT: .LBB222_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB222_4 +; RV32I-NEXT: beq a1, s0, .LBB222_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB222_5 ; RV32I-NEXT: .LBB222_4: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB222_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB222_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB222_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB222_1 ; RV32I-NEXT: .LBB222_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26438,45 +26433,44 @@ define i64 
@atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB222_2 ; RV32IA-NEXT: .LBB222_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB222_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB222_7 ; RV32IA-NEXT: .LBB222_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB222_4 +; RV32IA-NEXT: beq a1, s0, .LBB222_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB222_5 ; RV32IA-NEXT: .LBB222_4: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB222_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB222_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB222_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB222_1 ; RV32IA-NEXT: .LBB222_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26490,30 +26484,30 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB222_2 ; RV64I-NEXT: .LBB222_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB222_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB222_4 ; RV64I-NEXT: .LBB222_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s1, a3, .LBB222_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s0, a0, .LBB222_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB222_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB222_1 ; RV64I-NEXT: .LBB222_4: # %atomicrmw.end -; RV64I-NEXT: mv 
a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -26542,45 +26536,44 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB223_2 ; RV32I-NEXT: .LBB223_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB223_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB223_7 ; RV32I-NEXT: .LBB223_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB223_4 +; RV32I-NEXT: beq a1, s0, .LBB223_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB223_5 ; RV32I-NEXT: .LBB223_4: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB223_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB223_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB223_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB223_1 ; RV32I-NEXT: .LBB223_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26596,45 +26589,44 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB223_2 ; RV32IA-NEXT: .LBB223_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB223_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB223_7 ; RV32IA-NEXT: .LBB223_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, 
.LBB223_4 +; RV32IA-NEXT: beq a1, s0, .LBB223_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB223_5 ; RV32IA-NEXT: .LBB223_4: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB223_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB223_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB223_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB223_1 ; RV32IA-NEXT: .LBB223_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26648,30 +26640,30 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB223_2 ; RV64I-NEXT: .LBB223_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB223_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB223_4 ; RV64I-NEXT: .LBB223_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s1, a3, .LBB223_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s0, a0, .LBB223_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB223_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB223_1 ; RV64I-NEXT: .LBB223_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -26700,45 +26692,44 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB224_2 ; RV32I-NEXT: .LBB224_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB224_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 
12(sp) +; RV32I-NEXT: bnez a2, .LBB224_7 ; RV32I-NEXT: .LBB224_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB224_4 +; RV32I-NEXT: beq a1, s0, .LBB224_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB224_5 ; RV32I-NEXT: .LBB224_4: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB224_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB224_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB224_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB224_1 ; RV32I-NEXT: .LBB224_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26754,45 +26745,44 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB224_2 ; RV32IA-NEXT: .LBB224_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB224_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB224_7 ; RV32IA-NEXT: .LBB224_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB224_4 +; RV32IA-NEXT: beq a1, s0, .LBB224_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB224_5 ; RV32IA-NEXT: .LBB224_4: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB224_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB224_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB224_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB224_1 ; RV32IA-NEXT: .LBB224_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26806,30 +26796,30 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ; 
RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB224_2 ; RV64I-NEXT: .LBB224_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB224_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB224_4 ; RV64I-NEXT: .LBB224_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s1, a3, .LBB224_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s0, a0, .LBB224_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB224_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB224_1 ; RV64I-NEXT: .LBB224_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -26858,45 +26848,44 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB225_2 ; RV32I-NEXT: .LBB225_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB225_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB225_7 ; RV32I-NEXT: .LBB225_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB225_4 +; RV32I-NEXT: beq a1, s0, .LBB225_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB225_5 ; RV32I-NEXT: .LBB225_4: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB225_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB225_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB225_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB225_1 ; RV32I-NEXT: .LBB225_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; 
RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26912,45 +26901,44 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB225_2 ; RV32IA-NEXT: .LBB225_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB225_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB225_7 ; RV32IA-NEXT: .LBB225_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB225_4 +; RV32IA-NEXT: beq a1, s0, .LBB225_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB225_5 ; RV32IA-NEXT: .LBB225_4: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB225_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB225_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB225_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB225_1 ; RV32IA-NEXT: .LBB225_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -26964,30 +26952,30 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB225_2 ; RV64I-NEXT: .LBB225_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB225_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB225_4 ; RV64I-NEXT: .LBB225_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s1, a3, .LBB225_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s0, a0, .LBB225_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB225_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, 
s0 ; RV64I-NEXT: j .LBB225_1 ; RV64I-NEXT: .LBB225_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27011,45 +26999,44 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB226_2 ; RV32I-NEXT: .LBB226_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB226_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB226_7 ; RV32I-NEXT: .LBB226_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB226_4 +; RV32I-NEXT: beq a1, s0, .LBB226_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB226_5 ; RV32I-NEXT: .LBB226_4: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB226_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB226_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB226_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB226_1 ; RV32I-NEXT: .LBB226_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27065,45 +27052,44 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB226_2 ; RV32IA-NEXT: .LBB226_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB226_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB226_7 ; RV32IA-NEXT: .LBB226_2: # 
%atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB226_4 +; RV32IA-NEXT: beq a1, s0, .LBB226_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB226_5 ; RV32IA-NEXT: .LBB226_4: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB226_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB226_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB226_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB226_1 ; RV32IA-NEXT: .LBB226_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27117,30 +27103,30 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB226_2 ; RV64I-NEXT: .LBB226_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB226_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB226_4 ; RV64I-NEXT: .LBB226_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s1, a3, .LBB226_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s0, a0, .LBB226_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB226_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB226_1 ; RV64I-NEXT: .LBB226_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27169,45 +27155,44 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB227_2 ; RV32I-NEXT: .LBB227_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; 
RV32I-NEXT: bnez a0, .LBB227_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB227_7 ; RV32I-NEXT: .LBB227_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB227_4 +; RV32I-NEXT: beq a1, s0, .LBB227_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB227_5 ; RV32I-NEXT: .LBB227_4: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB227_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB227_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB227_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB227_1 ; RV32I-NEXT: .LBB227_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27223,45 +27208,44 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB227_2 ; RV32IA-NEXT: .LBB227_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB227_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB227_7 ; RV32IA-NEXT: .LBB227_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB227_4 +; RV32IA-NEXT: beq a1, s0, .LBB227_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB227_5 ; RV32IA-NEXT: .LBB227_4: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB227_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB227_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB227_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB227_1 ; RV32IA-NEXT: .LBB227_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte 
Folded Reload @@ -27275,30 +27259,30 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB227_2 ; RV64I-NEXT: .LBB227_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB227_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB227_4 ; RV64I-NEXT: .LBB227_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s1, a3, .LBB227_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s0, a0, .LBB227_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB227_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB227_1 ; RV64I-NEXT: .LBB227_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27327,45 +27311,44 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB228_2 ; RV32I-NEXT: .LBB228_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB228_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB228_7 ; RV32I-NEXT: .LBB228_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB228_4 +; RV32I-NEXT: beq a1, s0, .LBB228_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB228_5 ; RV32I-NEXT: .LBB228_4: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB228_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB228_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB228_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB228_1 ; RV32I-NEXT: .LBB228_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; 
RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27381,45 +27364,44 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB228_2 ; RV32IA-NEXT: .LBB228_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB228_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB228_7 ; RV32IA-NEXT: .LBB228_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB228_4 +; RV32IA-NEXT: beq a1, s0, .LBB228_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB228_5 ; RV32IA-NEXT: .LBB228_4: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB228_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB228_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB228_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB228_1 ; RV32IA-NEXT: .LBB228_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27433,30 +27415,30 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB228_2 ; RV64I-NEXT: .LBB228_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB228_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB228_4 ; RV64I-NEXT: .LBB228_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s1, a3, .LBB228_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s0, a0, .LBB228_1 ; RV64I-NEXT: # %bb.3: # 
%atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB228_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB228_1 ; RV64I-NEXT: .LBB228_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27485,45 +27467,44 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB229_2 ; RV32I-NEXT: .LBB229_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB229_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB229_7 ; RV32I-NEXT: .LBB229_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB229_4 +; RV32I-NEXT: beq a1, s0, .LBB229_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB229_5 ; RV32I-NEXT: .LBB229_4: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB229_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB229_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB229_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB229_1 ; RV32I-NEXT: .LBB229_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27539,45 +27520,44 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB229_2 ; RV32IA-NEXT: .LBB229_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB229_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw 
a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB229_7 ; RV32IA-NEXT: .LBB229_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB229_4 +; RV32IA-NEXT: beq a1, s0, .LBB229_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB229_5 ; RV32IA-NEXT: .LBB229_4: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB229_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB229_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB229_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB229_1 ; RV32IA-NEXT: .LBB229_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27591,30 +27571,30 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB229_2 ; RV64I-NEXT: .LBB229_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB229_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB229_4 ; RV64I-NEXT: .LBB229_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s1, a3, .LBB229_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s0, a0, .LBB229_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB229_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB229_1 ; RV64I-NEXT: .LBB229_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27643,45 +27623,44 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB230_2 ; RV32I-NEXT: .LBB230_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; 
RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB230_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB230_7 ; RV32I-NEXT: .LBB230_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB230_4 +; RV32I-NEXT: beq a1, s0, .LBB230_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB230_5 ; RV32I-NEXT: .LBB230_4: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB230_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB230_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB230_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB230_1 ; RV32I-NEXT: .LBB230_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27697,45 +27676,44 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB230_2 ; RV32IA-NEXT: .LBB230_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB230_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB230_7 ; RV32IA-NEXT: .LBB230_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB230_4 +; RV32IA-NEXT: beq a1, s0, .LBB230_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB230_5 ; RV32IA-NEXT: .LBB230_4: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB230_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB230_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB230_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB230_1 ; RV32IA-NEXT: .LBB230_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 
4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27749,30 +27727,30 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB230_2 ; RV64I-NEXT: .LBB230_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB230_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB230_4 ; RV64I-NEXT: .LBB230_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s1, a3, .LBB230_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s0, a0, .LBB230_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB230_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB230_1 ; RV64I-NEXT: .LBB230_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27796,45 +27774,44 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB231_2 ; RV32I-NEXT: .LBB231_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB231_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB231_7 ; RV32I-NEXT: .LBB231_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB231_4 +; RV32I-NEXT: beq a1, s0, .LBB231_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB231_5 ; RV32I-NEXT: .LBB231_4: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB231_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB231_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB231_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; 
RV32I-NEXT: j .LBB231_1 ; RV32I-NEXT: .LBB231_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27850,45 +27827,44 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB231_2 ; RV32IA-NEXT: .LBB231_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB231_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB231_7 ; RV32IA-NEXT: .LBB231_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB231_4 +; RV32IA-NEXT: beq a1, s0, .LBB231_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB231_5 ; RV32IA-NEXT: .LBB231_4: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB231_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB231_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB231_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB231_1 ; RV32IA-NEXT: .LBB231_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -27902,30 +27878,30 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB231_2 ; RV64I-NEXT: .LBB231_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB231_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB231_4 ; RV64I-NEXT: .LBB231_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: 
bltu s1, a3, .LBB231_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s0, a0, .LBB231_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB231_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB231_1 ; RV64I-NEXT: .LBB231_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -27954,45 +27930,44 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB232_2 ; RV32I-NEXT: .LBB232_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB232_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB232_7 ; RV32I-NEXT: .LBB232_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB232_4 +; RV32I-NEXT: beq a1, s0, .LBB232_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB232_5 ; RV32I-NEXT: .LBB232_4: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB232_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB232_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB232_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB232_1 ; RV32I-NEXT: .LBB232_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28008,45 +27983,44 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB232_2 ; RV32IA-NEXT: .LBB232_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 
8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB232_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB232_7 ; RV32IA-NEXT: .LBB232_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB232_4 +; RV32IA-NEXT: beq a1, s0, .LBB232_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB232_5 ; RV32IA-NEXT: .LBB232_4: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB232_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB232_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB232_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB232_1 ; RV32IA-NEXT: .LBB232_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28060,30 +28034,30 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB232_2 ; RV64I-NEXT: .LBB232_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB232_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB232_4 ; RV64I-NEXT: .LBB232_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s1, a3, .LBB232_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s0, a0, .LBB232_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB232_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB232_1 ; RV64I-NEXT: .LBB232_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -28112,45 +28086,44 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB233_2 ; RV32I-NEXT: .LBB233_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: 
addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB233_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB233_7 ; RV32I-NEXT: .LBB233_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB233_4 +; RV32I-NEXT: beq a1, s0, .LBB233_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB233_5 ; RV32I-NEXT: .LBB233_4: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB233_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB233_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB233_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB233_1 ; RV32I-NEXT: .LBB233_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28166,45 +28139,44 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB233_2 ; RV32IA-NEXT: .LBB233_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB233_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB233_7 ; RV32IA-NEXT: .LBB233_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB233_4 +; RV32IA-NEXT: beq a1, s0, .LBB233_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB233_5 ; RV32IA-NEXT: .LBB233_4: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB233_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB233_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB233_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB233_1 ; RV32IA-NEXT: 
.LBB233_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28218,30 +28190,30 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB233_2 ; RV64I-NEXT: .LBB233_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB233_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB233_4 ; RV64I-NEXT: .LBB233_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s1, a3, .LBB233_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s0, a0, .LBB233_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB233_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB233_1 ; RV64I-NEXT: .LBB233_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -28270,45 +28242,44 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB234_2 ; RV32I-NEXT: .LBB234_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB234_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB234_7 ; RV32I-NEXT: .LBB234_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB234_4 +; RV32I-NEXT: beq a1, s0, .LBB234_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB234_5 ; RV32I-NEXT: .LBB234_4: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB234_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB234_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB234_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in 
Loop: Header=BB234_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB234_1 ; RV32I-NEXT: .LBB234_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28324,45 +28295,44 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB234_2 ; RV32IA-NEXT: .LBB234_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB234_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB234_7 ; RV32IA-NEXT: .LBB234_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB234_4 +; RV32IA-NEXT: beq a1, s0, .LBB234_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB234_5 ; RV32IA-NEXT: .LBB234_4: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB234_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB234_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB234_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB234_1 ; RV32IA-NEXT: .LBB234_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28376,30 +28346,30 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB234_2 ; RV64I-NEXT: .LBB234_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB234_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB234_4 ; RV64I-NEXT: .LBB234_2: # 
%atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s1, a3, .LBB234_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s0, a0, .LBB234_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB234_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB234_1 ; RV64I-NEXT: .LBB234_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -28428,45 +28398,44 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB235_2 ; RV32I-NEXT: .LBB235_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB235_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB235_7 ; RV32I-NEXT: .LBB235_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB235_4 +; RV32I-NEXT: beq a1, s0, .LBB235_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB235_5 ; RV32I-NEXT: .LBB235_4: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB235_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB235_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB235_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB235_1 ; RV32I-NEXT: .LBB235_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28482,45 +28451,44 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB235_2 ; RV32IA-NEXT: .LBB235_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; 
RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB235_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB235_7 ; RV32IA-NEXT: .LBB235_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB235_4 +; RV32IA-NEXT: beq a1, s0, .LBB235_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB235_5 ; RV32IA-NEXT: .LBB235_4: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB235_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB235_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB235_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB235_1 ; RV32IA-NEXT: .LBB235_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28534,30 +28502,30 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB235_2 ; RV64I-NEXT: .LBB235_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB235_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB235_4 ; RV64I-NEXT: .LBB235_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s1, a3, .LBB235_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s0, a0, .LBB235_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB235_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB235_1 ; RV64I-NEXT: .LBB235_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -28581,45 +28549,44 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB236_2 ; RV32I-NEXT: .LBB236_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; 
RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 2 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB236_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB236_7 ; RV32I-NEXT: .LBB236_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB236_4 +; RV32I-NEXT: beq a1, s0, .LBB236_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB236_5 ; RV32I-NEXT: .LBB236_4: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB236_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB236_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB236_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB236_1 ; RV32I-NEXT: .LBB236_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28635,45 +28602,44 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB236_2 ; RV32IA-NEXT: .LBB236_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 2 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB236_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB236_7 ; RV32IA-NEXT: .LBB236_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB236_4 +; RV32IA-NEXT: beq a1, s0, .LBB236_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB236_5 ; RV32IA-NEXT: .LBB236_4: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB236_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB236_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB236_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB236_2 
Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB236_1 ; RV32IA-NEXT: .LBB236_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28687,30 +28653,30 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB236_2 ; RV64I-NEXT: .LBB236_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 2 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB236_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB236_4 ; RV64I-NEXT: .LBB236_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s1, a3, .LBB236_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s0, a0, .LBB236_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB236_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB236_1 ; RV64I-NEXT: .LBB236_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -28739,45 +28705,44 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB237_2 ; RV32I-NEXT: .LBB237_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 3 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB237_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB237_7 ; RV32I-NEXT: .LBB237_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB237_4 +; RV32I-NEXT: beq a1, s0, .LBB237_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB237_5 ; RV32I-NEXT: .LBB237_4: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB237_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB237_1 +; RV32I-NEXT: mv 
a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB237_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB237_1 ; RV32I-NEXT: .LBB237_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28793,45 +28758,44 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB237_2 ; RV32IA-NEXT: .LBB237_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 3 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB237_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB237_7 ; RV32IA-NEXT: .LBB237_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB237_4 +; RV32IA-NEXT: beq a1, s0, .LBB237_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB237_5 ; RV32IA-NEXT: .LBB237_4: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB237_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB237_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB237_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB237_1 ; RV32IA-NEXT: .LBB237_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28845,30 +28809,30 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB237_2 ; RV64I-NEXT: .LBB237_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 3 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, 
.LBB237_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB237_4 ; RV64I-NEXT: .LBB237_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s1, a3, .LBB237_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s0, a0, .LBB237_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB237_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB237_1 ; RV64I-NEXT: .LBB237_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -28897,45 +28861,44 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB238_2 ; RV32I-NEXT: .LBB238_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 4 ; RV32I-NEXT: li a5, 2 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB238_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB238_7 ; RV32I-NEXT: .LBB238_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB238_4 +; RV32I-NEXT: beq a1, s0, .LBB238_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB238_5 ; RV32I-NEXT: .LBB238_4: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB238_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB238_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB238_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB238_1 ; RV32I-NEXT: .LBB238_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -28951,45 +28914,44 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB238_2 ; RV32IA-NEXT: .LBB238_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; 
RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 4 ; RV32IA-NEXT: li a5, 2 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB238_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB238_7 ; RV32IA-NEXT: .LBB238_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB238_4 +; RV32IA-NEXT: beq a1, s0, .LBB238_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB238_5 ; RV32IA-NEXT: .LBB238_4: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB238_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB238_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB238_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB238_1 ; RV32IA-NEXT: .LBB238_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -29003,30 +28965,30 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB238_2 ; RV64I-NEXT: .LBB238_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 2 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB238_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB238_4 ; RV64I-NEXT: .LBB238_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s1, a3, .LBB238_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s0, a0, .LBB238_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB238_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB238_1 ; RV64I-NEXT: .LBB238_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -29055,45 +29017,44 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j 
.LBB239_2 ; RV32I-NEXT: .LBB239_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB239_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB239_7 ; RV32I-NEXT: .LBB239_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB239_4 +; RV32I-NEXT: beq a1, s0, .LBB239_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB239_5 ; RV32I-NEXT: .LBB239_4: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB239_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB239_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB239_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB239_1 ; RV32I-NEXT: .LBB239_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -29109,45 +29070,44 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB239_2 ; RV32IA-NEXT: .LBB239_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB239_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB239_7 ; RV32IA-NEXT: .LBB239_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB239_4 +; RV32IA-NEXT: beq a1, s0, .LBB239_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB239_5 ; RV32IA-NEXT: .LBB239_4: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB239_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB239_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: 
beqz a4, .LBB239_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB239_1 ; RV32IA-NEXT: .LBB239_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -29161,30 +29121,30 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB239_2 ; RV64I-NEXT: .LBB239_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB239_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB239_4 ; RV64I-NEXT: .LBB239_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s1, a3, .LBB239_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s0, a0, .LBB239_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB239_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB239_1 ; RV64I-NEXT: .LBB239_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll index aea7473ceece4..81c47f8701c50 100644 --- a/llvm/test/CodeGen/RISCV/atomic-signext.ll +++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll @@ -586,34 +586,34 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: slli a0, s0, 24 ; RV32I-NEXT: srai s2, a0, 24 ; RV32I-NEXT: j .LBB10_2 ; RV32I-NEXT: .LBB10_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB10_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a1, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) +; RV32I-NEXT: lbu a1, 15(sp) ; RV32I-NEXT: bnez a0, .LBB10_4 ; RV32I-NEXT: .LBB10_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: slli a0, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: blt s2, a0, .LBB10_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB10_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j 
.LBB10_1 ; RV32I-NEXT: .LBB10_4: # %atomicrmw.end -; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: slli a0, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -660,34 +660,34 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: slli a0, s0, 56 ; RV64I-NEXT: srai s2, a0, 56 ; RV64I-NEXT: j .LBB10_2 ; RV64I-NEXT: .LBB10_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB10_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a1, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) +; RV64I-NEXT: lbu a1, 15(sp) ; RV64I-NEXT: bnez a0, .LBB10_4 ; RV64I-NEXT: .LBB10_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 +; RV64I-NEXT: slli a0, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: mv a2, a1 ; RV64I-NEXT: blt s2, a0, .LBB10_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB10_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB10_1 ; RV64I-NEXT: .LBB10_4: # %atomicrmw.end -; RV64I-NEXT: slli a0, a3, 56 +; RV64I-NEXT: slli a0, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -738,34 +738,34 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 24 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: slli a0, s0, 24 ; RV32I-NEXT: srai s2, a0, 24 ; RV32I-NEXT: j .LBB11_2 ; RV32I-NEXT: .LBB11_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a1, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) +; RV32I-NEXT: lbu a1, 15(sp) ; RV32I-NEXT: bnez a0, .LBB11_4 ; RV32I-NEXT: .LBB11_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: slli a0, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: bge s2, a0, .LBB11_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB11_1 ; RV32I-NEXT: .LBB11_4: # %atomicrmw.end -; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: slli a0, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -812,34 +812,34 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: 
sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 56 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: slli a0, s0, 56 ; RV64I-NEXT: srai s2, a0, 56 ; RV64I-NEXT: j .LBB11_2 ; RV64I-NEXT: .LBB11_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a1, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) +; RV64I-NEXT: lbu a1, 15(sp) ; RV64I-NEXT: bnez a0, .LBB11_4 ; RV64I-NEXT: .LBB11_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 56 +; RV64I-NEXT: slli a0, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 -; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: mv a2, a1 ; RV64I-NEXT: bge s2, a0, .LBB11_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB11_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB11_1 ; RV64I-NEXT: .LBB11_4: # %atomicrmw.end -; RV64I-NEXT: slli a0, a3, 56 +; RV64I-NEXT: slli a0, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -890,32 +890,32 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: andi s2, a1, 255 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: andi s2, s0, 255 ; RV32I-NEXT: j .LBB12_2 ; RV32I-NEXT: .LBB12_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB12_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a1, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) +; RV32I-NEXT: lbu a1, 15(sp) ; RV32I-NEXT: bnez a0, .LBB12_4 ; RV32I-NEXT: .LBB12_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: andi a0, a1, 255 +; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: bltu s2, a0, .LBB12_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB12_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB12_1 ; RV32I-NEXT: .LBB12_4: # %atomicrmw.end -; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: slli a0, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -957,32 +957,32 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: andi s2, a1, 255 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: andi s2, s0, 255 ; RV64I-NEXT: j .LBB12_2 ; RV64I-NEXT: 
.LBB12_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB12_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a1, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) +; RV64I-NEXT: lbu a1, 15(sp) ; RV64I-NEXT: bnez a0, .LBB12_4 ; RV64I-NEXT: .LBB12_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: andi a0, a1, 255 +; RV64I-NEXT: mv a2, a1 ; RV64I-NEXT: bltu s2, a0, .LBB12_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB12_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB12_1 ; RV64I-NEXT: .LBB12_4: # %atomicrmw.end -; RV64I-NEXT: slli a0, a3, 56 +; RV64I-NEXT: slli a0, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -1028,32 +1028,32 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: andi s2, a1, 255 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: andi s2, s0, 255 ; RV32I-NEXT: j .LBB13_2 ; RV32I-NEXT: .LBB13_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a1, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) +; RV32I-NEXT: lbu a1, 15(sp) ; RV32I-NEXT: bnez a0, .LBB13_4 ; RV32I-NEXT: .LBB13_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: andi a0, a1, 255 +; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: bgeu s2, a0, .LBB13_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB13_1 ; RV32I-NEXT: .LBB13_4: # %atomicrmw.end -; RV32I-NEXT: slli a0, a3, 24 +; RV32I-NEXT: slli a0, a1, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1095,32 +1095,32 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: andi s2, a1, 255 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: andi s2, s0, 255 ; RV64I-NEXT: j .LBB13_2 ; RV64I-NEXT: .LBB13_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a1, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) +; RV64I-NEXT: lbu a1, 15(sp) ; RV64I-NEXT: bnez a0, .LBB13_4 ; RV64I-NEXT: .LBB13_2: # %atomicrmw.start ; RV64I-NEXT: 
# =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: andi a0, a1, 255 +; RV64I-NEXT: mv a2, a1 ; RV64I-NEXT: bgeu s2, a0, .LBB13_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB13_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB13_1 ; RV64I-NEXT: .LBB13_4: # %atomicrmw.end -; RV64I-NEXT: slli a0, a3, 56 +; RV64I-NEXT: slli a0, a1, 56 ; RV64I-NEXT: srai a0, a0, 56 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -1634,34 +1634,34 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: slli a0, s0, 16 ; RV32I-NEXT: srai s2, a0, 16 ; RV32I-NEXT: j .LBB21_2 ; RV32I-NEXT: .LBB21_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB21_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a1, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) +; RV32I-NEXT: lh a1, 14(sp) ; RV32I-NEXT: bnez a0, .LBB21_4 ; RV32I-NEXT: .LBB21_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 +; RV32I-NEXT: slli a0, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: blt s2, a0, .LBB21_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB21_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB21_1 ; RV32I-NEXT: .LBB21_4: # %atomicrmw.end -; RV32I-NEXT: slli a0, a3, 16 +; RV32I-NEXT: slli a0, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1710,34 +1710,34 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: slli a0, s0, 48 ; RV64I-NEXT: srai s2, a0, 48 ; RV64I-NEXT: j .LBB21_2 ; RV64I-NEXT: .LBB21_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB21_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a1, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) +; RV64I-NEXT: lh a1, 14(sp) ; RV64I-NEXT: bnez a0, .LBB21_4 ; RV64I-NEXT: .LBB21_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 +; RV64I-NEXT: slli a0, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: mv a2, a1 ; RV64I-NEXT: blt s2, a0, .LBB21_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB21_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j 
.LBB21_1 ; RV64I-NEXT: .LBB21_4: # %atomicrmw.end -; RV64I-NEXT: slli a0, a3, 48 +; RV64I-NEXT: slli a0, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -1790,34 +1790,34 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 -; RV32I-NEXT: slli a0, a1, 16 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: slli a0, s0, 16 ; RV32I-NEXT: srai s2, a0, 16 ; RV32I-NEXT: j .LBB22_2 ; RV32I-NEXT: .LBB22_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB22_2 Depth=1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: sh a1, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) +; RV32I-NEXT: lh a1, 14(sp) ; RV32I-NEXT: bnez a0, .LBB22_4 ; RV32I-NEXT: .LBB22_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: slli a0, a3, 16 +; RV32I-NEXT: slli a0, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: bge s2, a0, .LBB22_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB22_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB22_1 ; RV32I-NEXT: .LBB22_4: # %atomicrmw.end -; RV32I-NEXT: slli a0, a3, 16 +; RV32I-NEXT: slli a0, a1, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1866,34 +1866,34 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 -; RV64I-NEXT: slli a0, a1, 48 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: slli a0, s0, 48 ; RV64I-NEXT: srai s2, a0, 48 ; RV64I-NEXT: j .LBB22_2 ; RV64I-NEXT: .LBB22_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB22_2 Depth=1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: sh a1, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) +; RV64I-NEXT: lh a1, 14(sp) ; RV64I-NEXT: bnez a0, .LBB22_4 ; RV64I-NEXT: .LBB22_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: slli a0, a3, 48 +; RV64I-NEXT: slli a0, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: mv a2, a3 +; RV64I-NEXT: mv a2, a1 ; RV64I-NEXT: bge s2, a0, .LBB22_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB22_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB22_1 ; RV64I-NEXT: .LBB22_4: # %atomicrmw.end -; RV64I-NEXT: slli a0, a3, 48 +; RV64I-NEXT: slli a0, a1, 48 ; RV64I-NEXT: srai a0, a0, 48 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -2530,30 +2530,30 @@ define signext i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind { ; 
RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB32_2 ; RV32I-NEXT: .LBB32_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB32_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB32_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB32_4 ; RV32I-NEXT: .LBB32_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: blt s1, a3, .LBB32_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt s0, a0, .LBB32_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB32_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB32_1 ; RV32I-NEXT: .LBB32_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2572,31 +2572,31 @@ define signext i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB32_2 ; RV64I-NEXT: .LBB32_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB32_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB32_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB32_4 ; RV64I-NEXT: .LBB32_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s2, a3, .LBB32_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s2, a0, .LBB32_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB32_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB32_1 ; RV64I-NEXT: .LBB32_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -2619,30 +2619,30 @@ define signext i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB33_2 ; RV32I-NEXT: .LBB33_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB33_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, 
s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB33_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB33_4 ; RV32I-NEXT: .LBB33_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bge s1, a3, .LBB33_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bge s0, a0, .LBB33_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB33_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB33_1 ; RV32I-NEXT: .LBB33_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2661,31 +2661,31 @@ define signext i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB33_2 ; RV64I-NEXT: .LBB33_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB33_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB33_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB33_4 ; RV64I-NEXT: .LBB33_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s2, a3, .LBB33_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s2, a0, .LBB33_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB33_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB33_1 ; RV64I-NEXT: .LBB33_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -2708,30 +2708,30 @@ define signext i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB34_2 ; RV32I-NEXT: .LBB34_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB34_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB34_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB34_4 ; RV32I-NEXT: .LBB34_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bltu s1, a3, .LBB34_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu s0, a0, .LBB34_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB34_2 
Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB34_1 ; RV32I-NEXT: .LBB34_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2750,31 +2750,31 @@ define signext i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB34_2 ; RV64I-NEXT: .LBB34_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB34_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB34_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB34_4 ; RV64I-NEXT: .LBB34_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s2, a3, .LBB34_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s2, a0, .LBB34_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB34_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB34_1 ; RV64I-NEXT: .LBB34_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -2797,30 +2797,30 @@ define signext i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB35_2 ; RV32I-NEXT: .LBB35_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB35_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB35_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB35_4 ; RV32I-NEXT: .LBB35_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a3 -; RV32I-NEXT: bgeu s1, a3, .LBB35_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgeu s0, a0, .LBB35_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB35_2 Depth=1 -; RV32I-NEXT: mv a2, s1 +; RV32I-NEXT: mv a2, s0 ; RV32I-NEXT: j .LBB35_1 ; RV32I-NEXT: .LBB35_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -2839,31 +2839,31 @@ define signext i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte 
Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB35_2 ; RV64I-NEXT: .LBB35_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB35_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB35_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB35_4 ; RV64I-NEXT: .LBB35_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s2, a3, .LBB35_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s2, a0, .LBB35_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB35_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB35_1 ; RV64I-NEXT: .LBB35_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -3183,45 +3183,44 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB43_2 ; RV32I-NEXT: .LBB43_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB43_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB43_7 ; RV32I-NEXT: .LBB43_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB43_4 +; RV32I-NEXT: beq a1, s0, .LBB43_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB43_5 ; RV32I-NEXT: .LBB43_4: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB43_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB43_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB43_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB43_1 ; RV32I-NEXT: .LBB43_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3237,45 +3236,44 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr 
%a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB43_2 ; RV32IA-NEXT: .LBB43_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB43_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB43_7 ; RV32IA-NEXT: .LBB43_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB43_4 +; RV32IA-NEXT: beq a1, s0, .LBB43_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB43_5 ; RV32IA-NEXT: .LBB43_4: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB43_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB43_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB43_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB43_1 ; RV32IA-NEXT: .LBB43_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3289,30 +3287,30 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB43_2 ; RV64I-NEXT: .LBB43_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB43_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB43_4 ; RV64I-NEXT: .LBB43_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: blt s1, a3, .LBB43_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt s0, a0, .LBB43_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB43_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB43_1 ; RV64I-NEXT: .LBB43_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded 
Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3336,45 +3334,44 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB44_2 ; RV32I-NEXT: .LBB44_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB44_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB44_7 ; RV32I-NEXT: .LBB44_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB44_4 +; RV32I-NEXT: beq a1, s0, .LBB44_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: slt a0, s0, a5 +; RV32I-NEXT: slt a4, s0, a1 ; RV32I-NEXT: j .LBB44_5 ; RV32I-NEXT: .LBB44_4: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB44_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB44_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB44_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB44_1 ; RV32I-NEXT: .LBB44_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3390,45 +3387,44 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB44_2 ; RV32IA-NEXT: .LBB44_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB44_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB44_7 ; RV32IA-NEXT: .LBB44_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB44_4 +; RV32IA-NEXT: beq a1, s0, .LBB44_4 ; RV32IA-NEXT: # 
%bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: slt a0, s0, a5 +; RV32IA-NEXT: slt a4, s0, a1 ; RV32IA-NEXT: j .LBB44_5 ; RV32IA-NEXT: .LBB44_4: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB44_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB44_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB44_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB44_1 ; RV32IA-NEXT: .LBB44_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3442,30 +3438,30 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB44_2 ; RV64I-NEXT: .LBB44_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB44_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB44_4 ; RV64I-NEXT: .LBB44_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bge s1, a3, .LBB44_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bge s0, a0, .LBB44_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB44_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB44_1 ; RV64I-NEXT: .LBB44_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3489,45 +3485,44 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB45_2 ; RV32I-NEXT: .LBB45_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB45_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB45_7 ; RV32I-NEXT: .LBB45_2: # 
%atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB45_4 +; RV32I-NEXT: beq a1, s0, .LBB45_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB45_5 ; RV32I-NEXT: .LBB45_4: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB45_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: bnez a0, .LBB45_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: bnez a4, .LBB45_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB45_1 ; RV32I-NEXT: .LBB45_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3543,45 +3538,44 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB45_2 ; RV32IA-NEXT: .LBB45_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB45_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB45_7 ; RV32IA-NEXT: .LBB45_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB45_4 +; RV32IA-NEXT: beq a1, s0, .LBB45_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB45_5 ; RV32IA-NEXT: .LBB45_4: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB45_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: bnez a0, .LBB45_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: bnez a4, .LBB45_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB45_1 ; RV32IA-NEXT: .LBB45_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3595,30 +3589,30 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 
8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB45_2 ; RV64I-NEXT: .LBB45_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB45_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB45_4 ; RV64I-NEXT: .LBB45_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bltu s1, a3, .LBB45_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu s0, a0, .LBB45_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB45_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB45_1 ; RV64I-NEXT: .LBB45_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3642,45 +3636,44 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB46_2 ; RV32I-NEXT: .LBB46_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: li a5, 0 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB46_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB46_7 ; RV32I-NEXT: .LBB46_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a5, s0, .LBB46_4 +; RV32I-NEXT: beq a1, s0, .LBB46_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a5 +; RV32I-NEXT: sltu a4, s0, a1 ; RV32I-NEXT: j .LBB46_5 ; RV32I-NEXT: .LBB46_4: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a4 +; RV32I-NEXT: sltu a4, s1, a0 ; RV32I-NEXT: .LBB46_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: mv a2, a4 -; RV32I-NEXT: mv a3, a5 -; RV32I-NEXT: beqz a0, .LBB46_1 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: mv a3, a1 +; RV32I-NEXT: beqz a4, .LBB46_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 ; RV32I-NEXT: j .LBB46_1 ; RV32I-NEXT: .LBB46_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3696,45 +3689,44 @@ define signext i64 
@atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV32IA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32IA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB46_2 ; RV32IA-NEXT: .LBB46_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: li a4, 0 ; RV32IA-NEXT: li a5, 0 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB46_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB46_7 ; RV32IA-NEXT: .LBB46_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a5, s0, .LBB46_4 +; RV32IA-NEXT: beq a1, s0, .LBB46_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a5 +; RV32IA-NEXT: sltu a4, s0, a1 ; RV32IA-NEXT: j .LBB46_5 ; RV32IA-NEXT: .LBB46_4: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a4 +; RV32IA-NEXT: sltu a4, s1, a0 ; RV32IA-NEXT: .LBB46_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: mv a2, a4 -; RV32IA-NEXT: mv a3, a5 -; RV32IA-NEXT: beqz a0, .LBB46_1 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: mv a3, a1 +; RV32IA-NEXT: beqz a4, .LBB46_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 ; RV32IA-NEXT: j .LBB46_1 ; RV32IA-NEXT: .LBB46_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -3748,30 +3740,30 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB46_2 ; RV64I-NEXT: .LBB46_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB46_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB46_4 ; RV64I-NEXT: .LBB46_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a3 -; RV64I-NEXT: bgeu s1, a3, .LBB46_1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bgeu s0, a0, .LBB46_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB46_2 Depth=1 -; RV64I-NEXT: mv a2, s1 +; RV64I-NEXT: mv a2, s0 ; RV64I-NEXT: j .LBB46_1 ; RV64I-NEXT: .LBB46_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; 
RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -4298,10 +4290,10 @@ define signext i32 @atomicrmw_xchg_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32IA-NEXT: amoswap.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB53_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 0(a0) ; RV32IA-NEXT: li a2, 1 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_i32_monotonic_crossbb: @@ -4334,10 +4326,10 @@ define signext i32 @atomicrmw_xchg_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64IA-NEXT: amoswap.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB53_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) +; RV64IA-NEXT: lw a1, 0(a0) ; RV64IA-NEXT: li a2, 1 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -4385,10 +4377,10 @@ define signext i32 @atomicrmw_add_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32IA-NEXT: amoadd.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB54_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) -; RV32IA-NEXT: addi a2, a0, 1 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: addi a2, a1, 1 +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_add_i32_monotonic_crossbb: @@ -4421,10 +4413,10 @@ define signext i32 @atomicrmw_add_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64IA-NEXT: amoadd.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB54_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) -; RV64IA-NEXT: addi a2, a0, 1 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: addi a2, a1, 1 +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -4473,10 +4465,10 @@ define signext i32 @atomicrmw_sub_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32IA-NEXT: amoadd.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB55_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) -; RV32IA-NEXT: addi a2, a0, -1 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: addi a2, a1, -1 +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_sub_i32_monotonic_crossbb: @@ -4509,10 +4501,10 @@ define signext i32 @atomicrmw_sub_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64IA-NEXT: amoadd.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB55_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) -; RV64IA-NEXT: addi a2, a0, -1 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: addi a2, a1, -1 +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -4561,10 +4553,10 @@ define signext i32 @atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32IA-NEXT: amoand.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB56_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) -; RV32IA-NEXT: andi a2, a0, 1 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: andi a2, a1, 1 +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i32_monotonic_crossbb: @@ -4597,10 +4589,10 @@ define signext i32 
@atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64IA-NEXT: amoand.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB56_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) -; RV64IA-NEXT: andi a2, a0, 1 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: andi a2, a1, 1 +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -4642,24 +4634,25 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV32IA-NOZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb: ; RV32IA-NOZACAS: # %bb.0: -; RV32IA-NOZACAS-NEXT: andi a2, a1, 1 -; RV32IA-NOZACAS-NEXT: mv a1, a0 -; RV32IA-NOZACAS-NEXT: beqz a2, .LBB57_2 +; RV32IA-NOZACAS-NEXT: andi a1, a1, 1 +; RV32IA-NOZACAS-NEXT: beqz a1, .LBB57_2 ; RV32IA-NOZACAS-NEXT: # %bb.1: # %then ; RV32IA-NOZACAS-NEXT: li a2, 1 ; RV32IA-NOZACAS-NEXT: .LBB57_3: # %then ; RV32IA-NOZACAS-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NOZACAS-NEXT: lr.w a0, (a1) -; RV32IA-NOZACAS-NEXT: and a3, a0, a2 +; RV32IA-NOZACAS-NEXT: lr.w a1, (a0) +; RV32IA-NOZACAS-NEXT: and a3, a1, a2 ; RV32IA-NOZACAS-NEXT: not a3, a3 -; RV32IA-NOZACAS-NEXT: sc.w a3, a3, (a1) +; RV32IA-NOZACAS-NEXT: sc.w a3, a3, (a0) ; RV32IA-NOZACAS-NEXT: bnez a3, .LBB57_3 ; RV32IA-NOZACAS-NEXT: # %bb.4: # %then +; RV32IA-NOZACAS-NEXT: mv a0, a1 ; RV32IA-NOZACAS-NEXT: ret ; RV32IA-NOZACAS-NEXT: .LBB57_2: # %else -; RV32IA-NOZACAS-NEXT: lw a0, 0(a1) -; RV32IA-NOZACAS-NEXT: andi a2, a0, 1 -; RV32IA-NOZACAS-NEXT: sw a2, 0(a1) +; RV32IA-NOZACAS-NEXT: lw a1, 0(a0) +; RV32IA-NOZACAS-NEXT: andi a2, a1, 1 +; RV32IA-NOZACAS-NEXT: sw a2, 0(a0) +; RV32IA-NOZACAS-NEXT: mv a0, a1 ; RV32IA-NOZACAS-NEXT: ret ; ; RV32IA-ZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb: @@ -4708,24 +4701,25 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV64IA-NOZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb: ; RV64IA-NOZACAS: # %bb.0: -; RV64IA-NOZACAS-NEXT: andi a2, a1, 1 -; RV64IA-NOZACAS-NEXT: mv a1, a0 -; RV64IA-NOZACAS-NEXT: beqz a2, .LBB57_2 +; RV64IA-NOZACAS-NEXT: andi a1, a1, 1 +; RV64IA-NOZACAS-NEXT: beqz a1, .LBB57_2 ; RV64IA-NOZACAS-NEXT: # %bb.1: # %then ; RV64IA-NOZACAS-NEXT: li a2, 1 ; RV64IA-NOZACAS-NEXT: .LBB57_3: # %then ; RV64IA-NOZACAS-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64IA-NOZACAS-NEXT: lr.w a0, (a1) -; RV64IA-NOZACAS-NEXT: and a3, a0, a2 +; RV64IA-NOZACAS-NEXT: lr.w a1, (a0) +; RV64IA-NOZACAS-NEXT: and a3, a1, a2 ; RV64IA-NOZACAS-NEXT: not a3, a3 -; RV64IA-NOZACAS-NEXT: sc.w a3, a3, (a1) +; RV64IA-NOZACAS-NEXT: sc.w a3, a3, (a0) ; RV64IA-NOZACAS-NEXT: bnez a3, .LBB57_3 ; RV64IA-NOZACAS-NEXT: # %bb.4: # %then +; RV64IA-NOZACAS-NEXT: mv a0, a1 ; RV64IA-NOZACAS-NEXT: ret ; RV64IA-NOZACAS-NEXT: .LBB57_2: # %else -; RV64IA-NOZACAS-NEXT: lw a0, 0(a1) -; RV64IA-NOZACAS-NEXT: andi a2, a0, 1 -; RV64IA-NOZACAS-NEXT: sw a2, 0(a1) +; RV64IA-NOZACAS-NEXT: lw a1, 0(a0) +; RV64IA-NOZACAS-NEXT: andi a2, a1, 1 +; RV64IA-NOZACAS-NEXT: sw a2, 0(a0) +; RV64IA-NOZACAS-NEXT: mv a0, a1 ; RV64IA-NOZACAS-NEXT: ret ; ; RV64IA-ZACAS-LABEL: atomicrmw_nand_i32_monotonic_crossbb: @@ -4797,10 +4791,10 @@ define signext i32 @atomicrmw_or_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind { ; RV32IA-NEXT: amoor.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB58_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) -; RV32IA-NEXT: ori a2, a0, 1 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: ori a2, a1, 1 +; RV32IA-NEXT: sw 
a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i32_monotonic_crossbb: @@ -4833,10 +4827,10 @@ define signext i32 @atomicrmw_or_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind { ; RV64IA-NEXT: amoor.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB58_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) -; RV64IA-NEXT: ori a2, a0, 1 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: ori a2, a1, 1 +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -4885,10 +4879,10 @@ define signext i32 @atomicrmw_xor_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32IA-NEXT: amoxor.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB59_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) -; RV32IA-NEXT: xori a2, a0, 1 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: xori a2, a1, 1 +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i32_monotonic_crossbb: @@ -4921,10 +4915,10 @@ define signext i32 @atomicrmw_xor_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64IA-NEXT: amoxor.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB59_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) -; RV64IA-NEXT: xori a2, a0, 1 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: xori a2, a1, 1 +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -4949,40 +4943,40 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: beqz a1, .LBB60_5 ; RV32I-NEXT: # %bb.1: # %then -; RV32I-NEXT: lw a1, 0(s0) +; RV32I-NEXT: lw a0, 0(s0) ; RV32I-NEXT: j .LBB60_3 ; RV32I-NEXT: .LBB60_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB60_3 Depth=1 -; RV32I-NEXT: sw a1, 4(sp) +; RV32I-NEXT: sw a0, 4(sp) ; RV32I-NEXT: addi a1, sp, 4 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a1, 4(sp) -; RV32I-NEXT: bnez a0, .LBB60_8 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 4(sp) +; RV32I-NEXT: bnez a1, .LBB60_8 ; RV32I-NEXT: .LBB60_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgtz a1, .LBB60_2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bgtz a0, .LBB60_2 ; RV32I-NEXT: # %bb.4: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB60_3 Depth=1 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: j .LBB60_2 ; RV32I-NEXT: .LBB60_5: # %else -; RV32I-NEXT: lw a1, 0(s0) -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: bgtz a1, .LBB60_7 +; RV32I-NEXT: lw a0, 0(s0) +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: bgtz a0, .LBB60_7 ; RV32I-NEXT: # %bb.6: # %else -; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: .LBB60_7: # %else -; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 0(s0) ; RV32I-NEXT: .LBB60_8: # %merge -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 @@ -4990,21 +4984,21 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV32IA-LABEL: atomicrmw_max_i32_monotonic_crossbb: ; RV32IA: # %bb.0: -; 
RV32IA-NEXT: andi a2, a1, 1 -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: beqz a2, .LBB60_2 +; RV32IA-NEXT: andi a1, a1, 1 +; RV32IA-NEXT: beqz a1, .LBB60_2 ; RV32IA-NEXT: # %bb.1: # %then -; RV32IA-NEXT: li a0, 1 -; RV32IA-NEXT: amomax.w a0, a0, (a1) +; RV32IA-NEXT: li a1, 1 +; RV32IA-NEXT: amomax.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB60_2: # %else -; RV32IA-NEXT: lw a0, 0(a1) -; RV32IA-NEXT: mv a2, a0 -; RV32IA-NEXT: bgtz a0, .LBB60_4 +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: mv a2, a1 +; RV32IA-NEXT: bgtz a1, .LBB60_4 ; RV32IA-NEXT: # %bb.3: # %else ; RV32IA-NEXT: li a2, 1 ; RV32IA-NEXT: .LBB60_4: # %else -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_max_i32_monotonic_crossbb: @@ -5012,41 +5006,41 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64I-NEXT: addi sp, sp, -32 ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: beqz a1, .LBB60_5 ; RV64I-NEXT: # %bb.1: # %then -; RV64I-NEXT: lw a1, 0(s0) +; RV64I-NEXT: lw a0, 0(s0) ; RV64I-NEXT: j .LBB60_3 ; RV64I-NEXT: .LBB60_2: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB60_3 Depth=1 -; RV64I-NEXT: sw a1, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a1, 12(sp) -; RV64I-NEXT: bnez a0, .LBB60_8 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB60_8 ; RV64I-NEXT: .LBB60_3: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt a0, a1, .LBB60_2 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt a1, a0, .LBB60_2 ; RV64I-NEXT: # %bb.4: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB60_3 Depth=1 ; RV64I-NEXT: li a2, 1 ; RV64I-NEXT: j .LBB60_2 ; RV64I-NEXT: .LBB60_5: # %else -; RV64I-NEXT: lw a1, 0(s0) -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: bgtz a1, .LBB60_7 +; RV64I-NEXT: lw a0, 0(s0) +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: bgtz a0, .LBB60_7 ; RV64I-NEXT: # %bb.6: # %else -; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: li a1, 1 ; RV64I-NEXT: .LBB60_7: # %else -; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: sw a1, 0(s0) ; RV64I-NEXT: .LBB60_8: # %merge -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 32 @@ -5054,21 +5048,21 @@ define signext i32 @atomicrmw_max_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV64IA-LABEL: atomicrmw_max_i32_monotonic_crossbb: ; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a1, 1 -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: beqz a2, .LBB60_2 +; RV64IA-NEXT: andi a1, a1, 1 +; RV64IA-NEXT: beqz a1, .LBB60_2 ; RV64IA-NEXT: # %bb.1: # %then -; RV64IA-NEXT: li a0, 1 -; RV64IA-NEXT: amomax.w a0, a0, (a1) +; RV64IA-NEXT: li a1, 1 +; RV64IA-NEXT: amomax.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB60_2: # %else -; RV64IA-NEXT: lw a0, 0(a1) -; RV64IA-NEXT: mv a2, a0 -; RV64IA-NEXT: bgtz a0, .LBB60_4 +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: mv a2, a1 +; RV64IA-NEXT: bgtz a1, .LBB60_4 ; RV64IA-NEXT: # %bb.3: # %else ; RV64IA-NEXT: li a2, 1 ; RV64IA-NEXT: .LBB60_4: # %else -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; 
RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -5095,41 +5089,41 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: beqz a1, .LBB61_5 ; RV32I-NEXT: # %bb.1: # %then -; RV32I-NEXT: lw a1, 0(s0) +; RV32I-NEXT: lw a0, 0(s0) ; RV32I-NEXT: li s1, 2 ; RV32I-NEXT: j .LBB61_3 ; RV32I-NEXT: .LBB61_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB61_3 Depth=1 -; RV32I-NEXT: sw a1, 0(sp) +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a1, 0(sp) -; RV32I-NEXT: bnez a0, .LBB61_8 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB61_8 ; RV32I-NEXT: .LBB61_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: blt a1, s1, .LBB61_2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: blt a0, s1, .LBB61_2 ; RV32I-NEXT: # %bb.4: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB61_3 Depth=1 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: j .LBB61_2 ; RV32I-NEXT: .LBB61_5: # %else -; RV32I-NEXT: lw a1, 0(s0) -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: blez a1, .LBB61_7 +; RV32I-NEXT: lw a0, 0(s0) +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: blez a0, .LBB61_7 ; RV32I-NEXT: # %bb.6: # %else -; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: .LBB61_7: # %else -; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 0(s0) ; RV32I-NEXT: .LBB61_8: # %merge -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -5138,21 +5132,21 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV32IA-LABEL: atomicrmw_min_i32_monotonic_crossbb: ; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a1, 1 -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: beqz a2, .LBB61_2 +; RV32IA-NEXT: andi a1, a1, 1 +; RV32IA-NEXT: beqz a1, .LBB61_2 ; RV32IA-NEXT: # %bb.1: # %then -; RV32IA-NEXT: li a0, 1 -; RV32IA-NEXT: amomin.w a0, a0, (a1) +; RV32IA-NEXT: li a1, 1 +; RV32IA-NEXT: amomin.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB61_2: # %else -; RV32IA-NEXT: lw a0, 0(a1) -; RV32IA-NEXT: mv a2, a0 -; RV32IA-NEXT: blez a0, .LBB61_4 +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: mv a2, a1 +; RV32IA-NEXT: blez a1, .LBB61_4 ; RV32IA-NEXT: # %bb.3: # %else ; RV32IA-NEXT: li a2, 1 ; RV32IA-NEXT: .LBB61_4: # %else -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_min_i32_monotonic_crossbb: @@ -5161,41 +5155,41 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: beqz a1, .LBB61_5 ; RV64I-NEXT: # %bb.1: # %then -; RV64I-NEXT: lw a1, 0(s0) +; RV64I-NEXT: lw a0, 0(s0) ; RV64I-NEXT: li s1, 2 ; RV64I-NEXT: j .LBB61_3 ; RV64I-NEXT: .LBB61_2: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB61_3 Depth=1 -; RV64I-NEXT: sw a1, 4(sp) +; RV64I-NEXT: sw a0, 4(sp) ; RV64I-NEXT: 
addi a1, sp, 4 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a1, 4(sp) -; RV64I-NEXT: bnez a0, .LBB61_8 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 4(sp) +; RV64I-NEXT: bnez a1, .LBB61_8 ; RV64I-NEXT: .LBB61_3: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: blt a1, s1, .LBB61_2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: blt a0, s1, .LBB61_2 ; RV64I-NEXT: # %bb.4: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB61_3 Depth=1 ; RV64I-NEXT: li a2, 1 ; RV64I-NEXT: j .LBB61_2 ; RV64I-NEXT: .LBB61_5: # %else -; RV64I-NEXT: lw a1, 0(s0) -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: blez a1, .LBB61_7 +; RV64I-NEXT: lw a0, 0(s0) +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: blez a0, .LBB61_7 ; RV64I-NEXT: # %bb.6: # %else -; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: li a1, 1 ; RV64I-NEXT: .LBB61_7: # %else -; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: sw a1, 0(s0) ; RV64I-NEXT: .LBB61_8: # %merge -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -5204,21 +5198,21 @@ define signext i32 @atomicrmw_min_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV64IA-LABEL: atomicrmw_min_i32_monotonic_crossbb: ; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a1, 1 -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: beqz a2, .LBB61_2 +; RV64IA-NEXT: andi a1, a1, 1 +; RV64IA-NEXT: beqz a1, .LBB61_2 ; RV64IA-NEXT: # %bb.1: # %then -; RV64IA-NEXT: li a0, 1 -; RV64IA-NEXT: amomin.w a0, a0, (a1) +; RV64IA-NEXT: li a1, 1 +; RV64IA-NEXT: amomin.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB61_2: # %else -; RV64IA-NEXT: lw a0, 0(a1) -; RV64IA-NEXT: mv a2, a0 -; RV64IA-NEXT: blez a0, .LBB61_4 +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: mv a2, a1 +; RV64IA-NEXT: blez a1, .LBB61_4 ; RV64IA-NEXT: # %bb.3: # %else ; RV64IA-NEXT: li a2, 1 ; RV64IA-NEXT: .LBB61_4: # %else -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -5244,31 +5238,31 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: beqz a1, .LBB62_3 ; RV32I-NEXT: # %bb.1: # %then -; RV32I-NEXT: lw a1, 0(s0) +; RV32I-NEXT: lw a0, 0(s0) ; RV32I-NEXT: .LBB62_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: seqz a2, a1 -; RV32I-NEXT: add a2, a1, a2 -; RV32I-NEXT: sw a1, 4(sp) +; RV32I-NEXT: seqz a2, a0 +; RV32I-NEXT: add a2, a0, a2 +; RV32I-NEXT: sw a0, 4(sp) ; RV32I-NEXT: addi a1, sp, 4 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a1, 4(sp) -; RV32I-NEXT: beqz a0, .LBB62_2 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 4(sp) +; RV32I-NEXT: beqz a1, .LBB62_2 ; RV32I-NEXT: j .LBB62_4 ; RV32I-NEXT: .LBB62_3: # %else -; RV32I-NEXT: lw a1, 0(s0) -; RV32I-NEXT: seqz a0, a1 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: lw a0, 0(s0) +; RV32I-NEXT: seqz a1, a0 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: sw a1, 0(s0) ; RV32I-NEXT: .LBB62_4: # %merge -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; 
RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 @@ -5283,11 +5277,11 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32IA-NEXT: amomaxu.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB62_2: # %else -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: lw a0, 0(a0) -; RV32IA-NEXT: seqz a2, a0 -; RV32IA-NEXT: add a2, a0, a2 -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) +; RV32IA-NEXT: seqz a2, a1 +; RV32IA-NEXT: add a2, a1, a2 +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_umax_i32_monotonic_crossbb: @@ -5295,38 +5289,38 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64I-NEXT: addi sp, sp, -32 ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: beqz a1, .LBB62_5 ; RV64I-NEXT: # %bb.1: # %then -; RV64I-NEXT: lw a1, 0(s0) +; RV64I-NEXT: lw a0, 0(s0) ; RV64I-NEXT: j .LBB62_3 ; RV64I-NEXT: .LBB62_2: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB62_3 Depth=1 -; RV64I-NEXT: sw a1, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a1, 12(sp) -; RV64I-NEXT: bnez a0, .LBB62_6 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB62_6 ; RV64I-NEXT: .LBB62_3: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu a0, a1, .LBB62_2 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu a1, a0, .LBB62_2 ; RV64I-NEXT: # %bb.4: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB62_3 Depth=1 ; RV64I-NEXT: li a2, 1 ; RV64I-NEXT: j .LBB62_2 ; RV64I-NEXT: .LBB62_5: # %else -; RV64I-NEXT: lw a1, 0(s0) -; RV64I-NEXT: seqz a0, a1 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: lw a0, 0(s0) +; RV64I-NEXT: seqz a1, a0 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: sw a1, 0(s0) ; RV64I-NEXT: .LBB62_6: # %merge -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 32 @@ -5341,11 +5335,11 @@ define signext i32 @atomicrmw_umax_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64IA-NEXT: amomaxu.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB62_2: # %else -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: lw a0, 0(a0) -; RV64IA-NEXT: seqz a2, a0 -; RV64IA-NEXT: add a2, a0, a2 -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) +; RV64IA-NEXT: seqz a2, a1 +; RV64IA-NEXT: add a2, a1, a2 +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else @@ -5372,42 +5366,42 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: beqz a1, .LBB63_5 ; RV32I-NEXT: # %bb.1: # %then -; RV32I-NEXT: lw a1, 0(s0) +; RV32I-NEXT: lw a0, 0(s0) ; RV32I-NEXT: li s1, 2 ; RV32I-NEXT: j .LBB63_3 ; RV32I-NEXT: .LBB63_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB63_3 Depth=1 -; RV32I-NEXT: sw a1, 0(sp) +; 
RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: li a4, 0 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a1, 0(sp) -; RV32I-NEXT: bnez a0, .LBB63_8 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB63_8 ; RV32I-NEXT: .LBB63_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu a1, s1, .LBB63_2 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: bltu a0, s1, .LBB63_2 ; RV32I-NEXT: # %bb.4: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB63_3 Depth=1 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: j .LBB63_2 ; RV32I-NEXT: .LBB63_5: # %else -; RV32I-NEXT: lw a1, 0(s0) +; RV32I-NEXT: lw a0, 0(s0) ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: bltu a1, a2, .LBB63_7 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: bltu a0, a2, .LBB63_7 ; RV32I-NEXT: # %bb.6: # %else -; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: li a1, 1 ; RV32I-NEXT: .LBB63_7: # %else -; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 0(s0) ; RV32I-NEXT: .LBB63_8: # %merge -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -5416,22 +5410,22 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV32IA-LABEL: atomicrmw_umin_i32_monotonic_crossbb: ; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a1, 1 -; RV32IA-NEXT: mv a1, a0 -; RV32IA-NEXT: beqz a2, .LBB63_2 +; RV32IA-NEXT: andi a1, a1, 1 +; RV32IA-NEXT: beqz a1, .LBB63_2 ; RV32IA-NEXT: # %bb.1: # %then -; RV32IA-NEXT: li a0, 1 -; RV32IA-NEXT: amominu.w a0, a0, (a1) +; RV32IA-NEXT: li a1, 1 +; RV32IA-NEXT: amominu.w a0, a1, (a0) ; RV32IA-NEXT: ret ; RV32IA-NEXT: .LBB63_2: # %else -; RV32IA-NEXT: lw a0, 0(a1) +; RV32IA-NEXT: lw a1, 0(a0) ; RV32IA-NEXT: li a3, 1 -; RV32IA-NEXT: mv a2, a0 -; RV32IA-NEXT: bltu a0, a3, .LBB63_4 +; RV32IA-NEXT: mv a2, a1 +; RV32IA-NEXT: bltu a1, a3, .LBB63_4 ; RV32IA-NEXT: # %bb.3: # %else ; RV32IA-NEXT: li a2, 1 ; RV32IA-NEXT: .LBB63_4: # %else -; RV32IA-NEXT: sw a2, 0(a1) +; RV32IA-NEXT: sw a2, 0(a0) +; RV32IA-NEXT: mv a0, a1 ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomicrmw_umin_i32_monotonic_crossbb: @@ -5440,42 +5434,42 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: beqz a1, .LBB63_5 ; RV64I-NEXT: # %bb.1: # %then -; RV64I-NEXT: lw a1, 0(s0) +; RV64I-NEXT: lw a0, 0(s0) ; RV64I-NEXT: li s1, 2 ; RV64I-NEXT: j .LBB63_3 ; RV64I-NEXT: .LBB63_2: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB63_3 Depth=1 -; RV64I-NEXT: sw a1, 4(sp) +; RV64I-NEXT: sw a0, 4(sp) ; RV64I-NEXT: addi a1, sp, 4 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a1, 4(sp) -; RV64I-NEXT: bnez a0, .LBB63_8 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 4(sp) +; RV64I-NEXT: bnez a1, .LBB63_8 ; RV64I-NEXT: .LBB63_3: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: mv a2, a1 -; RV64I-NEXT: bltu a1, s1, .LBB63_2 +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: bltu a0, s1, .LBB63_2 ; RV64I-NEXT: # %bb.4: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB63_3 Depth=1 ; RV64I-NEXT: li a2, 
1 ; RV64I-NEXT: j .LBB63_2 ; RV64I-NEXT: .LBB63_5: # %else -; RV64I-NEXT: lw a1, 0(s0) +; RV64I-NEXT: lw a0, 0(s0) ; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: bltu a1, a2, .LBB63_7 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: bltu a0, a2, .LBB63_7 ; RV64I-NEXT: # %bb.6: # %else -; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: li a1, 1 ; RV64I-NEXT: .LBB63_7: # %else -; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: sw a1, 0(s0) ; RV64I-NEXT: .LBB63_8: # %merge -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -5484,22 +5478,22 @@ define signext i32 @atomicrmw_umin_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind ; ; RV64IA-LABEL: atomicrmw_umin_i32_monotonic_crossbb: ; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a1, 1 -; RV64IA-NEXT: mv a1, a0 -; RV64IA-NEXT: beqz a2, .LBB63_2 +; RV64IA-NEXT: andi a1, a1, 1 +; RV64IA-NEXT: beqz a1, .LBB63_2 ; RV64IA-NEXT: # %bb.1: # %then -; RV64IA-NEXT: li a0, 1 -; RV64IA-NEXT: amominu.w a0, a0, (a1) +; RV64IA-NEXT: li a1, 1 +; RV64IA-NEXT: amominu.w a0, a1, (a0) ; RV64IA-NEXT: ret ; RV64IA-NEXT: .LBB63_2: # %else -; RV64IA-NEXT: lw a0, 0(a1) +; RV64IA-NEXT: lw a1, 0(a0) ; RV64IA-NEXT: li a3, 1 -; RV64IA-NEXT: mv a2, a0 -; RV64IA-NEXT: bltu a0, a3, .LBB63_4 +; RV64IA-NEXT: mv a2, a1 +; RV64IA-NEXT: bltu a1, a3, .LBB63_4 ; RV64IA-NEXT: # %bb.3: # %else ; RV64IA-NEXT: li a2, 1 ; RV64IA-NEXT: .LBB63_4: # %else -; RV64IA-NEXT: sw a2, 0(a1) +; RV64IA-NEXT: sw a2, 0(a0) +; RV64IA-NEXT: mv a0, a1 ; RV64IA-NEXT: ret br i1 %c, label %then, label %else diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll index 34b29ea1dc6c2..82e64c9cb5f65 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll @@ -26,27 +26,27 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: .LBB0_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: sltu a0, a0, s2 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: and a0, a0, s1 -; RV32I-NEXT: sub a2, a3, a0 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: sltu a1, a1, s2 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a1, a1, s0 +; RV32I-NEXT: sub a2, a0, a1 +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: beqz a0, .LBB0_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: beqz a1, .LBB0_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -65,9 +65,9 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV32IA-NEXT: slli a3, a0, 3 ; RV32IA-NEXT: li a4, 255 ; RV32IA-NEXT: andi a0, a3, 24 -; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: sll a3, a4, a3 ; RV32IA-NEXT: not a3, a3 +; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: 
andi a4, a1, 255 ; RV32IA-NEXT: .LBB0_1: # %atomicrmw.start ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -109,27 +109,27 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: .cfi_offset s2, -32 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: .LBB0_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: sltu a0, a0, s2 -; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a0, s1 -; RV64I-NEXT: sub a2, a3, a0 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: sltu a1, a1, s2 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a1, a1, s0 +; RV64I-NEXT: sub a2, a0, a1 +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: beqz a0, .LBB0_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: beqz a1, .LBB0_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -145,18 +145,18 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_usub_cond_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: li a5, 255 -; RV64IA-NEXT: andi a0, a4, 24 -; RV64IA-NEXT: lw a3, 0(a2) -; RV64IA-NEXT: sllw a4, a5, a4 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: slli a3, a0, 3 +; RV64IA-NEXT: li a4, 255 +; RV64IA-NEXT: andi a0, a3, 24 +; RV64IA-NEXT: sllw a3, a4, a3 +; RV64IA-NEXT: not a3, a3 +; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: andi a5, a1, 255 ; RV64IA-NEXT: .LBB0_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB0_3 Depth 2 -; RV64IA-NEXT: srlw a6, a3, a0 -; RV64IA-NEXT: sext.w a7, a3 +; RV64IA-NEXT: srlw a6, a4, a0 +; RV64IA-NEXT: sext.w a7, a4 ; RV64IA-NEXT: andi t0, a6, 255 ; RV64IA-NEXT: sltu t0, t0, a5 ; RV64IA-NEXT: addi t0, t0, -1 @@ -164,20 +164,20 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) { ; RV64IA-NEXT: subw a6, a6, t0 ; RV64IA-NEXT: andi a6, a6, 255 ; RV64IA-NEXT: sllw a6, a6, a0 -; RV64IA-NEXT: and a3, a3, a4 -; RV64IA-NEXT: or a6, a3, a6 +; RV64IA-NEXT: and a4, a4, a3 +; RV64IA-NEXT: or a6, a4, a6 ; RV64IA-NEXT: .LBB0_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB0_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-NEXT: bne a3, a7, .LBB0_1 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a7, .LBB0_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB0_3 Depth=2 ; RV64IA-NEXT: sc.w.rl t0, a6, (a2) ; RV64IA-NEXT: bnez t0, .LBB0_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a3, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw usub_cond ptr %ptr, i8 %val seq_cst ret i8 %result @@ -200,27 +200,27 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV32I-NEXT: .cfi_offset s3, -20 ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu 
a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: .LBB1_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: sltu a0, a0, s3 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: and a0, a0, s0 -; RV32I-NEXT: sub a2, a1, a0 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: sltu a1, a1, s3 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a1, a1, s0 +; RV32I-NEXT: sub a2, a0, a1 +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: beqz a0, .LBB1_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: beqz a1, .LBB1_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -242,9 +242,9 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: lw a6, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 ; RV32IA-NEXT: not a4, a4 +; RV32IA-NEXT: lw a6, 0(a2) ; RV32IA-NEXT: and a5, a1, a3 ; RV32IA-NEXT: .LBB1_1: # %atomicrmw.start ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -290,27 +290,27 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV64I-NEXT: .cfi_offset s3, -40 ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: .LBB1_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: sltu a0, a0, s3 -; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a0, s0 -; RV64I-NEXT: sub a2, a1, a0 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: sltu a1, a1, s3 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a1, a1, s0 +; RV64I-NEXT: sub a2, a0, a1 +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a1, 6(sp) -; RV64I-NEXT: beqz a0, .LBB1_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: beqz a1, .LBB1_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -328,19 +328,19 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV64IA-LABEL: atomicrmw_usub_cond_i16: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a5, a0, 3 +; RV64IA-NEXT: slli a4, a0, 3 ; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: andi a0, a5, 24 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a4, 0(a2) -; RV64IA-NEXT: sllw a5, a3, a5 -; RV64IA-NEXT: not a5, a5 +; RV64IA-NEXT: sllw a4, a3, a4 +; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a5, 0(a2) ; RV64IA-NEXT: and a6, a1, a3 ; RV64IA-NEXT: .LBB1_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB1_3 Depth 2 -; RV64IA-NEXT: srlw a7, a4, a0 -; RV64IA-NEXT: sext.w t0, a4 +; RV64IA-NEXT: 
srlw a7, a5, a0 +; RV64IA-NEXT: sext.w t0, a5 ; RV64IA-NEXT: and t1, a7, a3 ; RV64IA-NEXT: sltu t1, t1, a6 ; RV64IA-NEXT: addi t1, t1, -1 @@ -348,20 +348,20 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: subw a7, a7, t1 ; RV64IA-NEXT: and a7, a7, a3 ; RV64IA-NEXT: sllw a7, a7, a0 -; RV64IA-NEXT: and a4, a4, a5 -; RV64IA-NEXT: or a7, a4, a7 +; RV64IA-NEXT: and a5, a5, a4 +; RV64IA-NEXT: or a7, a5, a7 ; RV64IA-NEXT: .LBB1_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB1_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a4, (a2) -; RV64IA-NEXT: bne a4, t0, .LBB1_1 +; RV64IA-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-NEXT: bne a5, t0, .LBB1_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB1_3 Depth=2 ; RV64IA-NEXT: sc.w.rl t1, a7, (a2) ; RV64IA-NEXT: bnez t1, .LBB1_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a4, a0 +; RV64IA-NEXT: srlw a0, a5, a0 ; RV64IA-NEXT: ret %result = atomicrmw usub_cond ptr %ptr, i16 %val seq_cst ret i16 %result @@ -378,25 +378,25 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) { ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: .LBB2_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: sltu a0, a3, s1 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: and a0, a0, s1 -; RV32I-NEXT: sub a2, a3, a0 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sltu a1, a0, s0 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a1, a1, s0 +; RV32I-NEXT: sub a2, a0, a1 +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: beqz a0, .LBB2_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: beqz a1, .LBB2_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -444,26 +444,26 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: .cfi_offset s2, -32 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: .LBB2_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: sltu a0, a3, s2 -; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a0, s1 -; RV64I-NEXT: subw a2, a3, a0 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sltu a1, a0, s2 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a1, a1, s0 +; RV64I-NEXT: subw a2, a0, a1 +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: beqz a0, .LBB2_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: beqz a1, .LBB2_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte 
Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -519,43 +519,42 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a5, s0 +; RV32I-NEXT: sltu a2, a1, s0 ; RV32I-NEXT: .LBB3_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: xori a0, a0, 1 -; RV32I-NEXT: neg a0, a0 -; RV32I-NEXT: and a1, a0, s2 -; RV32I-NEXT: and a0, a0, s0 -; RV32I-NEXT: sltu a3, a4, a1 -; RV32I-NEXT: sub a0, a5, a0 -; RV32I-NEXT: sub a2, a4, a1 -; RV32I-NEXT: sub a3, a0, a3 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: xori a2, a2, 1 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: and a3, a2, s1 +; RV32I-NEXT: and a2, a2, s0 +; RV32I-NEXT: sltu a4, a0, a3 +; RV32I-NEXT: sub a5, a1, a2 +; RV32I-NEXT: sub a2, a0, a3 +; RV32I-NEXT: sub a3, a5, a4 +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB3_5 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB3_5 ; RV32I-NEXT: .LBB3_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: bne a5, s0, .LBB3_1 +; RV32I-NEXT: bne a1, s0, .LBB3_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a4, s2 +; RV32I-NEXT: sltu a2, a0, s1 ; RV32I-NEXT: j .LBB3_2 ; RV32I-NEXT: .LBB3_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -581,43 +580,42 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB3_3 ; RV32IA-NEXT: .LBB3_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a5, s0 +; RV32IA-NEXT: sltu a2, a1, s0 ; RV32IA-NEXT: .LBB3_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: xori a0, a0, 1 -; RV32IA-NEXT: neg a0, a0 -; RV32IA-NEXT: and a1, a0, s2 -; RV32IA-NEXT: and a0, a0, s0 -; RV32IA-NEXT: sltu a3, a4, a1 -; RV32IA-NEXT: sub a0, a5, a0 -; RV32IA-NEXT: sub a2, a4, a1 -; RV32IA-NEXT: sub a3, a0, a3 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: xori a2, a2, 1 +; RV32IA-NEXT: neg a2, a2 +; RV32IA-NEXT: and a3, a2, s1 +; RV32IA-NEXT: and a2, a2, s0 +; RV32IA-NEXT: sltu a4, a0, a3 +; RV32IA-NEXT: sub a5, a1, a2 +; RV32IA-NEXT: sub a2, a0, a3 +; RV32IA-NEXT: sub a3, a5, a4 +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; 
RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB3_5 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB3_5 ; RV32IA-NEXT: .LBB3_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: bne a5, s0, .LBB3_1 +; RV32IA-NEXT: bne a1, s0, .LBB3_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a4, s2 +; RV32IA-NEXT: sltu a2, a0, s1 ; RV32IA-NEXT: j .LBB3_2 ; RV32IA-NEXT: .LBB3_5: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -640,25 +638,25 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: .LBB3_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: sltu a0, a3, s1 -; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a0, a0, s1 -; RV64I-NEXT: sub a2, a3, a0 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sltu a1, a0, s0 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a1, a1, s0 +; RV64I-NEXT: sub a2, a0, a1 +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: beqz a0, .LBB3_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: beqz a1, .LBB3_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -709,25 +707,25 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s1, a1, 255 ; RV32I-NEXT: .LBB4_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: sub a1, a0, s1 -; RV32I-NEXT: sltu a0, a0, a1 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: and a2, a0, a1 -; RV32I-NEXT: sb a3, 3(sp) +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: sub a2, a1, s1 +; RV32I-NEXT: sltu a1, a1, a2 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: sb a0, 3(sp) ; RV32I-NEXT: addi a1, sp, 3 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 3(sp) -; RV32I-NEXT: beqz a0, .LBB4_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 3(sp) +; RV32I-NEXT: beqz a1, .LBB4_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -741,12 +739,12 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) { ; RV32IA-LABEL: atomicrmw_usub_sat_i8: ; RV32IA: # 
%bb.0: ; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 -; RV32IA-NEXT: lw a4, 0(a2) -; RV32IA-NEXT: andi a0, a0, 24 +; RV32IA-NEXT: slli a3, a0, 3 +; RV32IA-NEXT: li a4, 255 +; RV32IA-NEXT: andi a0, a3, 24 +; RV32IA-NEXT: sll a3, a4, a3 ; RV32IA-NEXT: not a3, a3 +; RV32IA-NEXT: lw a4, 0(a2) ; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: .LBB4_1: # %atomicrmw.start ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -786,25 +784,25 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s1, a1, 255 ; RV64I-NEXT: .LBB4_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: sub a1, a0, s1 -; RV64I-NEXT: sltu a0, a0, a1 -; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: sub a2, a1, s1 +; RV64I-NEXT: sltu a1, a1, a2 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: sb a0, 7(sp) ; RV64I-NEXT: addi a1, sp, 7 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 7(sp) -; RV64I-NEXT: beqz a0, .LBB4_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 7(sp) +; RV64I-NEXT: beqz a1, .LBB4_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -818,38 +816,38 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_usub_sat_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: lw a3, 0(a2) -; RV64IA-NEXT: andi a0, a0, 24 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: slli a3, a0, 3 +; RV64IA-NEXT: li a4, 255 +; RV64IA-NEXT: andi a0, a3, 24 +; RV64IA-NEXT: sllw a3, a4, a3 +; RV64IA-NEXT: not a3, a3 +; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: .LBB4_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB4_3 Depth 2 -; RV64IA-NEXT: srlw a5, a3, a0 -; RV64IA-NEXT: sext.w a6, a3 +; RV64IA-NEXT: srlw a5, a4, a0 +; RV64IA-NEXT: sext.w a6, a4 ; RV64IA-NEXT: andi a5, a5, 255 ; RV64IA-NEXT: sub a7, a5, a1 ; RV64IA-NEXT: sltu a5, a5, a7 ; RV64IA-NEXT: addi a5, a5, -1 ; RV64IA-NEXT: and a5, a5, a7 ; RV64IA-NEXT: sllw a5, a5, a0 -; RV64IA-NEXT: and a3, a3, a4 -; RV64IA-NEXT: or a5, a3, a5 +; RV64IA-NEXT: and a4, a4, a3 +; RV64IA-NEXT: or a5, a4, a5 ; RV64IA-NEXT: .LBB4_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB4_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-NEXT: bne a3, a6, .LBB4_1 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a6, .LBB4_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_3 Depth=2 ; RV64IA-NEXT: sc.w.rl a7, a5, (a2) ; RV64IA-NEXT: bnez a7, .LBB4_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a3, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw usub_sat ptr %ptr, i8 %val seq_cst ret i8 %result @@ -869,27 +867,27 @@ define i16 
@atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s1, 16 ; RV32I-NEXT: addi s1, s1, -1 ; RV32I-NEXT: and s2, a1, s1 ; RV32I-NEXT: .LBB5_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a3, s1 -; RV32I-NEXT: sub a1, a0, s2 -; RV32I-NEXT: sltu a0, a0, a1 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: and a2, a0, a1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: and a1, a0, s1 +; RV32I-NEXT: sub a2, a1, s2 +; RV32I-NEXT: sltu a1, a1, a2 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: beqz a0, .LBB5_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: beqz a1, .LBB5_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -909,9 +907,9 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) { ; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 ; RV32IA-NEXT: not a4, a4 +; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: and a1, a1, a3 ; RV32IA-NEXT: .LBB5_1: # %atomicrmw.start ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -953,27 +951,27 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) { ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: .cfi_offset s2, -32 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s1, 16 ; RV64I-NEXT: addiw s1, s1, -1 ; RV64I-NEXT: and s2, a1, s1 ; RV64I-NEXT: .LBB5_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a3, s1 -; RV64I-NEXT: sub a1, a0, s2 -; RV64I-NEXT: sltu a0, a0, a1 -; RV64I-NEXT: addi a0, a0, -1 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: and a1, a0, s1 +; RV64I-NEXT: sub a2, a1, s2 +; RV64I-NEXT: sltu a1, a1, a2 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: beqz a0, .LBB5_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: beqz a1, .LBB5_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -989,39 +987,39 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) { ; RV64IA-LABEL: atomicrmw_usub_sat_i16: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a5, a0, 3 +; RV64IA-NEXT: slli a4, a0, 3 ; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: andi a0, a5, 24 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a4, 0(a2) -; RV64IA-NEXT: sllw a5, a3, a5 -; RV64IA-NEXT: not a5, a5 +; RV64IA-NEXT: sllw a4, a3, a4 +; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a5, 0(a2) ; RV64IA-NEXT: and a1, a1, a3 ; RV64IA-NEXT: .LBB5_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop 
Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB5_3 Depth 2 -; RV64IA-NEXT: srlw a6, a4, a0 -; RV64IA-NEXT: sext.w a7, a4 +; RV64IA-NEXT: srlw a6, a5, a0 +; RV64IA-NEXT: sext.w a7, a5 ; RV64IA-NEXT: and a6, a6, a3 ; RV64IA-NEXT: sub t0, a6, a1 ; RV64IA-NEXT: sltu a6, a6, t0 ; RV64IA-NEXT: addi a6, a6, -1 ; RV64IA-NEXT: and a6, a6, t0 ; RV64IA-NEXT: sllw a6, a6, a0 -; RV64IA-NEXT: and a4, a4, a5 -; RV64IA-NEXT: or a6, a4, a6 +; RV64IA-NEXT: and a5, a5, a4 +; RV64IA-NEXT: or a6, a5, a6 ; RV64IA-NEXT: .LBB5_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB5_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a4, (a2) -; RV64IA-NEXT: bne a4, a7, .LBB5_1 +; RV64IA-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-NEXT: bne a5, a7, .LBB5_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_3 Depth=2 ; RV64IA-NEXT: sc.w.rl t0, a6, (a2) ; RV64IA-NEXT: bnez t0, .LBB5_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a4, a0 +; RV64IA-NEXT: srlw a0, a5, a0 ; RV64IA-NEXT: ret %result = atomicrmw usub_sat ptr %ptr, i16 %val seq_cst ret i16 %result @@ -1038,25 +1036,25 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) { ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: .LBB6_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: sub a0, a3, s1 -; RV32I-NEXT: sltu a1, a3, a0 -; RV32I-NEXT: addi a1, a1, -1 -; RV32I-NEXT: and a2, a1, a0 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sub a1, a0, s0 +; RV32I-NEXT: sltu a2, a0, a1 +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: and a2, a2, a1 +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: beqz a0, .LBB6_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: beqz a1, .LBB6_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1102,25 +1100,25 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: .LBB6_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: subw a0, a3, s1 -; RV64I-NEXT: sltu a1, a3, a0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a2, a1, a0 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: subw a1, a0, s0 +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: sw a0, 4(sp) ; RV64I-NEXT: addi a1, sp, 4 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) -; RV64I-NEXT: beqz a0, .LBB6_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 4(sp) +; RV64I-NEXT: beqz a1, .LBB6_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld 
ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -1173,42 +1171,41 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB7_3 ; RV32I-NEXT: .LBB7_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_3 Depth=1 -; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sltu a4, a1, a3 ; RV32I-NEXT: .LBB7_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_3 Depth=1 -; RV32I-NEXT: addi a3, a2, -1 -; RV32I-NEXT: and a2, a3, a1 -; RV32I-NEXT: and a3, a3, a0 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a4, a4, -1 +; RV32I-NEXT: and a2, a4, a2 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB7_5 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB7_5 ; RV32I-NEXT: .LBB7_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: sltu a0, a4, s2 -; RV32I-NEXT: sub a1, a5, s0 -; RV32I-NEXT: sub a0, a1, a0 -; RV32I-NEXT: sub a1, a4, s2 -; RV32I-NEXT: bne a0, a5, .LBB7_1 +; RV32I-NEXT: sltu a2, a0, s1 +; RV32I-NEXT: sub a3, a1, s0 +; RV32I-NEXT: sub a3, a3, a2 +; RV32I-NEXT: sub a2, a0, s1 +; RV32I-NEXT: bne a3, a1, .LBB7_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB7_3 Depth=1 -; RV32I-NEXT: sltu a2, a4, a1 +; RV32I-NEXT: sltu a4, a0, a2 ; RV32I-NEXT: j .LBB7_2 ; RV32I-NEXT: .LBB7_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1234,42 +1231,41 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB7_3 ; RV32IA-NEXT: .LBB7_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_3 Depth=1 -; RV32IA-NEXT: sltu a2, a5, a0 +; RV32IA-NEXT: sltu a4, a1, a3 ; RV32IA-NEXT: .LBB7_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_3 Depth=1 -; RV32IA-NEXT: addi a3, a2, -1 -; RV32IA-NEXT: and a2, a3, a1 -; RV32IA-NEXT: and a3, a3, a0 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a4, a4, -1 +; RV32IA-NEXT: and a2, a4, a2 +; RV32IA-NEXT: and a3, a4, a3 +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB7_5 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: 
lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB7_5 ; RV32IA-NEXT: .LBB7_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: sltu a0, a4, s2 -; RV32IA-NEXT: sub a1, a5, s0 -; RV32IA-NEXT: sub a0, a1, a0 -; RV32IA-NEXT: sub a1, a4, s2 -; RV32IA-NEXT: bne a0, a5, .LBB7_1 +; RV32IA-NEXT: sltu a2, a0, s1 +; RV32IA-NEXT: sub a3, a1, s0 +; RV32IA-NEXT: sub a3, a3, a2 +; RV32IA-NEXT: sub a2, a0, s1 +; RV32IA-NEXT: bne a3, a1, .LBB7_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB7_3 Depth=1 -; RV32IA-NEXT: sltu a2, a4, a1 +; RV32IA-NEXT: sltu a4, a0, a2 ; RV32IA-NEXT: j .LBB7_2 ; RV32IA-NEXT: .LBB7_5: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1292,25 +1288,25 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: .LBB7_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: sub a0, a3, s1 -; RV64I-NEXT: sltu a1, a3, a0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: and a2, a1, a0 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sub a1, a0, s0 +; RV64I-NEXT: sltu a2, a0, a1 +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: beqz a0, .LBB7_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: beqz a1, .LBB7_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll index 3ff01e4987bd5..d67e047e8b05b 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll @@ -25,25 +25,25 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s1, a1, 255 ; RV32I-NEXT: .LBB0_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: addi a0, a3, 1 -; RV32I-NEXT: andi a1, a3, 255 -; RV32I-NEXT: sltu a1, a1, s1 -; RV32I-NEXT: neg a2, a1 -; RV32I-NEXT: and a2, a2, a0 -; RV32I-NEXT: sb a3, 3(sp) +; RV32I-NEXT: addi a1, a0, 1 +; RV32I-NEXT: andi a2, a0, 255 +; RV32I-NEXT: sltu a2, a2, s1 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: and a2, a2, a1 +; RV32I-NEXT: sb a0, 3(sp) ; RV32I-NEXT: addi a1, sp, 3 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 3(sp) -; RV32I-NEXT: beqz a0, .LBB0_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 3(sp) +; RV32I-NEXT: beqz a1, .LBB0_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; 
RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -57,12 +57,12 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV32IA-LABEL: atomicrmw_uinc_wrap_i8: ; RV32IA: # %bb.0: ; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 -; RV32IA-NEXT: lw a4, 0(a2) -; RV32IA-NEXT: andi a0, a0, 24 +; RV32IA-NEXT: slli a3, a0, 3 +; RV32IA-NEXT: li a4, 255 +; RV32IA-NEXT: andi a0, a3, 24 +; RV32IA-NEXT: sll a3, a4, a3 ; RV32IA-NEXT: not a3, a3 +; RV32IA-NEXT: lw a4, 0(a2) ; RV32IA-NEXT: andi a1, a1, 255 ; RV32IA-NEXT: .LBB0_1: # %atomicrmw.start ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -103,25 +103,25 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s1, a1, 255 ; RV64I-NEXT: .LBB0_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: addi a0, a3, 1 -; RV64I-NEXT: andi a1, a3, 255 -; RV64I-NEXT: sltu a1, a1, s1 -; RV64I-NEXT: neg a2, a1 -; RV64I-NEXT: and a2, a2, a0 -; RV64I-NEXT: sb a3, 7(sp) +; RV64I-NEXT: addi a1, a0, 1 +; RV64I-NEXT: andi a2, a0, 255 +; RV64I-NEXT: sltu a2, a2, s1 +; RV64I-NEXT: neg a2, a2 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: sb a0, 7(sp) ; RV64I-NEXT: addi a1, sp, 7 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 7(sp) -; RV64I-NEXT: beqz a0, .LBB0_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 7(sp) +; RV64I-NEXT: beqz a1, .LBB0_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -135,18 +135,18 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_uinc_wrap_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: lw a3, 0(a2) -; RV64IA-NEXT: andi a0, a0, 24 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: slli a3, a0, 3 +; RV64IA-NEXT: li a4, 255 +; RV64IA-NEXT: andi a0, a3, 24 +; RV64IA-NEXT: sllw a3, a4, a3 +; RV64IA-NEXT: not a3, a3 +; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: .LBB0_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB0_3 Depth 2 -; RV64IA-NEXT: srlw a5, a3, a0 -; RV64IA-NEXT: sext.w a6, a3 +; RV64IA-NEXT: srlw a5, a4, a0 +; RV64IA-NEXT: sext.w a6, a4 ; RV64IA-NEXT: andi a7, a5, 255 ; RV64IA-NEXT: addi a5, a5, 1 ; RV64IA-NEXT: sltu a7, a7, a1 @@ -154,20 +154,20 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-NEXT: and a5, a7, a5 ; RV64IA-NEXT: andi a5, a5, 255 ; RV64IA-NEXT: sllw a5, a5, a0 -; RV64IA-NEXT: and a3, a3, a4 -; RV64IA-NEXT: or a5, a3, a5 +; RV64IA-NEXT: and a4, a4, a3 +; RV64IA-NEXT: or a5, a4, a5 ; RV64IA-NEXT: .LBB0_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB0_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-NEXT: bne a3, a6, .LBB0_1 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a6, .LBB0_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB0_3 Depth=2 ; RV64IA-NEXT: sc.w.rl a7, a5, (a2) ; RV64IA-NEXT: bnez a7, .LBB0_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; 
RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a3, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst ret i8 %result @@ -187,27 +187,27 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lhu a3, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s1, 16 ; RV32I-NEXT: addi s1, s1, -1 ; RV32I-NEXT: and s2, a1, s1 ; RV32I-NEXT: .LBB1_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a3, s1 -; RV32I-NEXT: addi a1, a3, 1 -; RV32I-NEXT: sltu a0, a0, s2 -; RV32I-NEXT: neg a2, a0 -; RV32I-NEXT: and a2, a2, a1 -; RV32I-NEXT: sh a3, 14(sp) +; RV32I-NEXT: and a1, a0, s1 +; RV32I-NEXT: addi a2, a0, 1 +; RV32I-NEXT: sltu a1, a1, s2 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: sh a0, 14(sp) ; RV32I-NEXT: addi a1, sp, 14 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a3, 14(sp) -; RV32I-NEXT: beqz a0, .LBB1_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 14(sp) +; RV32I-NEXT: beqz a1, .LBB1_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -227,9 +227,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 ; RV32IA-NEXT: not a4, a4 +; RV32IA-NEXT: lw a5, 0(a2) ; RV32IA-NEXT: and a1, a1, a3 ; RV32IA-NEXT: .LBB1_1: # %atomicrmw.start ; RV32IA-NEXT: # =>This Loop Header: Depth=1 @@ -272,27 +272,27 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: .cfi_offset s2, -32 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lhu a3, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s1, 16 ; RV64I-NEXT: addiw s1, s1, -1 ; RV64I-NEXT: and s2, a1, s1 ; RV64I-NEXT: .LBB1_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a3, s1 -; RV64I-NEXT: addi a1, a3, 1 -; RV64I-NEXT: sltu a0, a0, s2 -; RV64I-NEXT: neg a2, a0 -; RV64I-NEXT: and a2, a2, a1 -; RV64I-NEXT: sh a3, 14(sp) +; RV64I-NEXT: and a1, a0, s1 +; RV64I-NEXT: addi a2, a0, 1 +; RV64I-NEXT: sltu a1, a1, s2 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: sh a0, 14(sp) ; RV64I-NEXT: addi a1, sp, 14 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh a3, 14(sp) -; RV64I-NEXT: beqz a0, .LBB1_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 14(sp) +; RV64I-NEXT: beqz a1, .LBB1_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -308,19 +308,19 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-LABEL: atomicrmw_uinc_wrap_i16: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a5, a0, 3 +; RV64IA-NEXT: slli a4, a0, 3 ; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: andi a0, a5, 24 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a4, 0(a2) -; RV64IA-NEXT: sllw a5, a3, a5 -; 
RV64IA-NEXT: not a5, a5 +; RV64IA-NEXT: sllw a4, a3, a4 +; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a5, 0(a2) ; RV64IA-NEXT: and a1, a1, a3 ; RV64IA-NEXT: .LBB1_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB1_3 Depth 2 -; RV64IA-NEXT: srlw a6, a4, a0 -; RV64IA-NEXT: sext.w a7, a4 +; RV64IA-NEXT: srlw a6, a5, a0 +; RV64IA-NEXT: sext.w a7, a5 ; RV64IA-NEXT: and t0, a6, a3 ; RV64IA-NEXT: addi a6, a6, 1 ; RV64IA-NEXT: sltu t0, t0, a1 @@ -328,20 +328,20 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: negw t0, t0 ; RV64IA-NEXT: and a6, t0, a6 ; RV64IA-NEXT: sllw a6, a6, a0 -; RV64IA-NEXT: and a4, a4, a5 -; RV64IA-NEXT: or a6, a4, a6 +; RV64IA-NEXT: and a5, a5, a4 +; RV64IA-NEXT: or a6, a5, a6 ; RV64IA-NEXT: .LBB1_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB1_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a4, (a2) -; RV64IA-NEXT: bne a4, a7, .LBB1_1 +; RV64IA-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-NEXT: bne a5, a7, .LBB1_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB1_3 Depth=2 ; RV64IA-NEXT: sc.w.rl t0, a6, (a2) ; RV64IA-NEXT: bnez t0, .LBB1_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a4, a0 +; RV64IA-NEXT: srlw a0, a5, a0 ; RV64IA-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -358,25 +358,25 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: .LBB2_1: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: addi a0, a3, 1 -; RV32I-NEXT: sltu a1, a3, s1 -; RV32I-NEXT: neg a2, a1 -; RV32I-NEXT: and a2, a2, a0 -; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: addi a1, a0, 1 +; RV32I-NEXT: sltu a2, a0, s0 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: and a2, a2, a1 +; RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: beqz a0, .LBB2_1 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: beqz a1, .LBB2_1 ; RV32I-NEXT: # %bb.2: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -423,24 +423,24 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s1, a1 ; RV64I-NEXT: .LBB2_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: addiw a0, a3, 1 -; RV64I-NEXT: sltu a1, a3, s1 -; RV64I-NEXT: neg a2, a1 -; RV64I-NEXT: and a2, a2, a0 -; RV64I-NEXT: sw a3, 4(sp) +; RV64I-NEXT: addiw a1, a0, 1 +; RV64I-NEXT: sltu a2, a0, s1 +; RV64I-NEXT: neg a2, a2 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: sw a0, 4(sp) ; RV64I-NEXT: addi a1, sp, 4 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 4(sp) -; RV64I-NEXT: beqz a0, .LBB2_1 +; 
RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 4(sp) +; RV64I-NEXT: beqz a1, .LBB2_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -494,41 +494,40 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a5, s0 +; RV32I-NEXT: sltu a2, a1, s0 ; RV32I-NEXT: .LBB3_2: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: addi a1, a4, 1 -; RV32I-NEXT: neg a0, a0 -; RV32I-NEXT: seqz a3, a1 -; RV32I-NEXT: and a2, a0, a1 -; RV32I-NEXT: add a3, a5, a3 -; RV32I-NEXT: and a3, a0, a3 -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: addi a3, a0, 1 +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: seqz a5, a3 +; RV32I-NEXT: and a2, a4, a3 +; RV32I-NEXT: add a3, a1, a5 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a4, 8(sp) -; RV32I-NEXT: lw a5, 12(sp) -; RV32I-NEXT: bnez a0, .LBB3_5 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB3_5 ; RV32I-NEXT: .LBB3_3: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: bne a5, s0, .LBB3_1 +; RV32I-NEXT: bne a1, s0, .LBB3_1 ; RV32I-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32I-NEXT: sltu a0, a4, s2 +; RV32I-NEXT: sltu a2, a0, s1 ; RV32I-NEXT: j .LBB3_2 ; RV32I-NEXT: .LBB3_5: # %atomicrmw.end -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -554,41 +553,40 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a4, 0(a0) -; RV32IA-NEXT: lw a5, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB3_3 ; RV32IA-NEXT: .LBB3_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a5, s0 +; RV32IA-NEXT: sltu a2, a1, s0 ; RV32IA-NEXT: .LBB3_2: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: addi a1, a4, 1 -; RV32IA-NEXT: neg a0, a0 -; RV32IA-NEXT: seqz a3, a1 -; RV32IA-NEXT: and a2, a0, a1 -; RV32IA-NEXT: add a3, a5, a3 -; RV32IA-NEXT: and a3, a0, a3 -; RV32IA-NEXT: sw a4, 8(sp) -; RV32IA-NEXT: sw a5, 12(sp) +; RV32IA-NEXT: addi a3, a0, 1 +; RV32IA-NEXT: neg a4, a2 +; RV32IA-NEXT: seqz a5, a3 +; RV32IA-NEXT: and a2, a4, a3 +; RV32IA-NEXT: add a3, a1, a5 +; RV32IA-NEXT: and a3, a4, a3 +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; 
RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a4, 8(sp) -; RV32IA-NEXT: lw a5, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB3_5 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB3_5 ; RV32IA-NEXT: .LBB3_3: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: bne a5, s0, .LBB3_1 +; RV32IA-NEXT: bne a1, s0, .LBB3_1 ; RV32IA-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 -; RV32IA-NEXT: sltu a0, a4, s2 +; RV32IA-NEXT: sltu a2, a0, s1 ; RV32IA-NEXT: j .LBB3_2 ; RV32IA-NEXT: .LBB3_5: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a4 -; RV32IA-NEXT: mv a1, a5 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -611,25 +609,25 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: .LBB3_1: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: addi a0, a3, 1 -; RV64I-NEXT: sltu a1, a3, s1 -; RV64I-NEXT: neg a2, a1 -; RV64I-NEXT: and a2, a2, a0 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: addi a1, a0, 1 +; RV64I-NEXT: sltu a2, a0, s0 +; RV64I-NEXT: neg a2, a2 +; RV64I-NEXT: and a2, a2, a1 +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: beqz a0, .LBB3_1 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: beqz a1, .LBB3_1 ; RV64I-NEXT: # %bb.2: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -681,35 +679,35 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lbu a0, 0(a0) ; RV32I-NEXT: andi s2, a1, 255 ; RV32I-NEXT: j .LBB4_2 ; RV32I-NEXT: .LBB4_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV32I-NEXT: sb a3, 15(sp) +; RV32I-NEXT: sb a0, 15(sp) ; RV32I-NEXT: addi a1, sp, 15 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_1 -; RV32I-NEXT: lbu a3, 15(sp) -; RV32I-NEXT: bnez a0, .LBB4_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lbu a0, 15(sp) +; RV32I-NEXT: bnez a1, .LBB4_4 ; RV32I-NEXT: .LBB4_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: andi a0, a3, 255 -; RV32I-NEXT: seqz a1, a0 -; RV32I-NEXT: sltu a0, s2, a0 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: mv a2, s1 -; RV32I-NEXT: bnez a0, .LBB4_1 +; RV32I-NEXT: andi a1, a0, 255 +; RV32I-NEXT: seqz a2, a1 +; RV32I-NEXT: sltu a1, s2, a1 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: mv a2, s0 +; RV32I-NEXT: bnez a1, .LBB4_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV32I-NEXT: addi a2, a3, -1 +; RV32I-NEXT: 
addi a2, a0, -1 ; RV32I-NEXT: j .LBB4_1 ; RV32I-NEXT: .LBB4_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -728,9 +726,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV32IA-NEXT: slli a3, a0, 3 ; RV32IA-NEXT: li a4, 255 ; RV32IA-NEXT: andi a0, a3, 24 -; RV32IA-NEXT: lw a6, 0(a2) ; RV32IA-NEXT: sll a3, a4, a3 ; RV32IA-NEXT: not a3, a3 +; RV32IA-NEXT: lw a6, 0(a2) ; RV32IA-NEXT: andi a4, a1, 255 ; RV32IA-NEXT: j .LBB4_2 ; RV32IA-NEXT: .LBB4_1: # %atomicrmw.start @@ -782,35 +780,35 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: .cfi_offset s2, -32 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lbu a0, 0(a0) ; RV64I-NEXT: andi s2, a1, 255 ; RV64I-NEXT: j .LBB4_2 ; RV64I-NEXT: .LBB4_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV64I-NEXT: sb a3, 15(sp) +; RV64I-NEXT: sb a0, 15(sp) ; RV64I-NEXT: addi a1, sp, 15 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_1 -; RV64I-NEXT: lbu a3, 15(sp) -; RV64I-NEXT: bnez a0, .LBB4_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lbu a0, 15(sp) +; RV64I-NEXT: bnez a1, .LBB4_4 ; RV64I-NEXT: .LBB4_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: andi a0, a3, 255 -; RV64I-NEXT: seqz a1, a0 -; RV64I-NEXT: sltu a0, s2, a0 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: mv a2, s1 -; RV64I-NEXT: bnez a0, .LBB4_1 +; RV64I-NEXT: andi a1, a0, 255 +; RV64I-NEXT: seqz a2, a1 +; RV64I-NEXT: sltu a1, s2, a1 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: mv a2, s0 +; RV64I-NEXT: bnez a1, .LBB4_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV64I-NEXT: addi a2, a3, -1 +; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: j .LBB4_1 ; RV64I-NEXT: .LBB4_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -826,37 +824,37 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_udec_wrap_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: li a5, 255 -; RV64IA-NEXT: andi a0, a4, 24 -; RV64IA-NEXT: lw a3, 0(a2) -; RV64IA-NEXT: sllw a4, a5, a4 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: slli a3, a0, 3 +; RV64IA-NEXT: li a4, 255 +; RV64IA-NEXT: andi a0, a3, 24 +; RV64IA-NEXT: sllw a3, a4, a3 +; RV64IA-NEXT: not a3, a3 +; RV64IA-NEXT: lw a4, 0(a2) ; RV64IA-NEXT: andi a5, a1, 255 ; RV64IA-NEXT: j .LBB4_2 ; RV64IA-NEXT: .LBB4_1: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV64IA-NEXT: sext.w a6, a3 +; RV64IA-NEXT: sext.w a6, a4 ; RV64IA-NEXT: andi a7, a7, 255 ; RV64IA-NEXT: sllw a7, a7, a0 -; RV64IA-NEXT: and a3, a3, a4 -; RV64IA-NEXT: or a7, a3, a7 +; RV64IA-NEXT: and a4, a4, a3 +; RV64IA-NEXT: or a7, a4, a7 ; RV64IA-NEXT: .LBB4_5: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB4_2 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a3, (a2) -; RV64IA-NEXT: bne a3, a6, .LBB4_7 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a6, .LBB4_7 ; RV64IA-NEXT: # %bb.6: # 
%atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_5 Depth=2 ; RV64IA-NEXT: sc.w.rl t0, a7, (a2) ; RV64IA-NEXT: bnez t0, .LBB4_5 ; RV64IA-NEXT: .LBB4_7: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV64IA-NEXT: beq a3, a6, .LBB4_4 +; RV64IA-NEXT: beq a4, a6, .LBB4_4 ; RV64IA-NEXT: .LBB4_2: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB4_5 Depth 2 -; RV64IA-NEXT: srlw a6, a3, a0 +; RV64IA-NEXT: srlw a6, a4, a0 ; RV64IA-NEXT: andi a7, a6, 255 ; RV64IA-NEXT: seqz t0, a7 ; RV64IA-NEXT: sltu a7, a5, a7 @@ -868,7 +866,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-NEXT: addi a7, a6, -1 ; RV64IA-NEXT: j .LBB4_1 ; RV64IA-NEXT: .LBB4_4: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a3, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst ret i8 %result @@ -891,35 +889,35 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV32I-NEXT: .cfi_offset s3, -20 ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 0(a0) ; RV32I-NEXT: lui s2, 16 ; RV32I-NEXT: addi s2, s2, -1 -; RV32I-NEXT: and s3, s0, s2 +; RV32I-NEXT: and s3, a1, s2 ; RV32I-NEXT: j .LBB5_2 ; RV32I-NEXT: .LBB5_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV32I-NEXT: sh a1, 10(sp) +; RV32I-NEXT: sh a0, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2 -; RV32I-NEXT: lh a1, 10(sp) -; RV32I-NEXT: bnez a0, .LBB5_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lh a0, 10(sp) +; RV32I-NEXT: bnez a1, .LBB5_4 ; RV32I-NEXT: .LBB5_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s2 -; RV32I-NEXT: seqz a2, a0 -; RV32I-NEXT: sltu a0, s3, a0 -; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: and a1, a0, s2 +; RV32I-NEXT: seqz a2, a1 +; RV32I-NEXT: sltu a1, s3, a1 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: mv a2, s0 -; RV32I-NEXT: bnez a0, .LBB5_1 +; RV32I-NEXT: bnez a1, .LBB5_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: addi a2, a0, -1 ; RV32I-NEXT: j .LBB5_1 ; RV32I-NEXT: .LBB5_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -941,9 +939,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV32IA-NEXT: lui a3, 16 ; RV32IA-NEXT: andi a0, a4, 24 ; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: lw a7, 0(a2) ; RV32IA-NEXT: sll a4, a3, a4 ; RV32IA-NEXT: not a4, a4 +; RV32IA-NEXT: lw a7, 0(a2) ; RV32IA-NEXT: and a5, a1, a3 ; RV32IA-NEXT: j .LBB5_2 ; RV32IA-NEXT: .LBB5_1: # %atomicrmw.start @@ -999,35 +997,35 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64I-NEXT: .cfi_offset s3, -40 ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: lhu a1, 0(a0) +; RV64I-NEXT: lhu a0, 0(a0) ; RV64I-NEXT: lui s2, 16 ; RV64I-NEXT: addiw s2, s2, -1 -; RV64I-NEXT: and s3, s0, s2 +; RV64I-NEXT: and s3, a1, s2 ; RV64I-NEXT: j .LBB5_2 ; RV64I-NEXT: .LBB5_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64I-NEXT: sh a1, 6(sp) +; RV64I-NEXT: sh a0, 6(sp) ; RV64I-NEXT: addi a1, sp, 6 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_2 -; RV64I-NEXT: lh 
a1, 6(sp) -; RV64I-NEXT: bnez a0, .LBB5_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lh a0, 6(sp) +; RV64I-NEXT: bnez a1, .LBB5_4 ; RV64I-NEXT: .LBB5_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: and a0, a1, s2 -; RV64I-NEXT: seqz a2, a0 -; RV64I-NEXT: sltu a0, s3, a0 -; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: and a1, a0, s2 +; RV64I-NEXT: seqz a2, a1 +; RV64I-NEXT: sltu a1, s3, a1 +; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: mv a2, s0 -; RV64I-NEXT: bnez a0, .LBB5_1 +; RV64I-NEXT: bnez a1, .LBB5_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64I-NEXT: addi a2, a1, -1 +; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: j .LBB5_1 ; RV64I-NEXT: .LBB5_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1045,38 +1043,38 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-LABEL: atomicrmw_udec_wrap_i16: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a5, a0, 3 +; RV64IA-NEXT: slli a4, a0, 3 ; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: andi a0, a5, 24 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a4, 0(a2) -; RV64IA-NEXT: sllw a5, a3, a5 -; RV64IA-NEXT: not a5, a5 +; RV64IA-NEXT: sllw a4, a3, a4 +; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a5, 0(a2) ; RV64IA-NEXT: and a6, a1, a3 ; RV64IA-NEXT: j .LBB5_2 ; RV64IA-NEXT: .LBB5_1: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64IA-NEXT: sext.w a7, a4 +; RV64IA-NEXT: sext.w a7, a5 ; RV64IA-NEXT: and t0, t0, a3 ; RV64IA-NEXT: sllw t0, t0, a0 -; RV64IA-NEXT: and a4, a4, a5 -; RV64IA-NEXT: or t0, a4, t0 +; RV64IA-NEXT: and a5, a5, a4 +; RV64IA-NEXT: or t0, a5, t0 ; RV64IA-NEXT: .LBB5_5: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB5_2 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a4, (a2) -; RV64IA-NEXT: bne a4, a7, .LBB5_7 +; RV64IA-NEXT: lr.w.aqrl a5, (a2) +; RV64IA-NEXT: bne a5, a7, .LBB5_7 ; RV64IA-NEXT: # %bb.6: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_5 Depth=2 ; RV64IA-NEXT: sc.w.rl t1, t0, (a2) ; RV64IA-NEXT: bnez t1, .LBB5_5 ; RV64IA-NEXT: .LBB5_7: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64IA-NEXT: beq a4, a7, .LBB5_4 +; RV64IA-NEXT: beq a5, a7, .LBB5_4 ; RV64IA-NEXT: .LBB5_2: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB5_5 Depth 2 -; RV64IA-NEXT: srlw a7, a4, a0 +; RV64IA-NEXT: srlw a7, a5, a0 ; RV64IA-NEXT: and t0, a7, a3 ; RV64IA-NEXT: seqz t1, t0 ; RV64IA-NEXT: sltu t0, a6, t0 @@ -1088,7 +1086,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: addi t0, a7, -1 ; RV64IA-NEXT: j .LBB5_1 ; RV64IA-NEXT: .LBB5_4: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a4, a0 +; RV64IA-NEXT: srlw a0, a5, a0 ; RV64IA-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -1105,33 +1103,33 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: j .LBB6_2 ; RV32I-NEXT: .LBB6_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB6_2 Depth=1 -; RV32I-NEXT: sw a3, 0(sp) +; 
RV32I-NEXT: sw a0, 0(sp) ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: li a3, 5 ; RV32I-NEXT: li a4, 5 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_4 -; RV32I-NEXT: lw a3, 0(sp) -; RV32I-NEXT: bnez a0, .LBB6_4 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: lw a0, 0(sp) +; RV32I-NEXT: bnez a1, .LBB6_4 ; RV32I-NEXT: .LBB6_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: seqz a0, a3 -; RV32I-NEXT: sltu a1, s1, a3 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: mv a2, s1 -; RV32I-NEXT: bnez a0, .LBB6_1 +; RV32I-NEXT: seqz a1, a0 +; RV32I-NEXT: sltu a2, s0, a0 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: mv a2, s0 +; RV32I-NEXT: bnez a1, .LBB6_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB6_2 Depth=1 -; RV32I-NEXT: addi a2, a3, -1 +; RV32I-NEXT: addi a2, a0, -1 ; RV32I-NEXT: j .LBB6_1 ; RV32I-NEXT: .LBB6_4: # %atomicrmw.end -; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1189,34 +1187,34 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 ; RV64I-NEXT: .cfi_offset s2, -32 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lw a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: sext.w s2, a1 ; RV64I-NEXT: j .LBB6_2 ; RV64I-NEXT: .LBB6_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB6_2 Depth=1 -; RV64I-NEXT: sw a3, 12(sp) +; RV64I-NEXT: sw a0, 12(sp) ; RV64I-NEXT: addi a1, sp, 12 ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_4 -; RV64I-NEXT: lw a3, 12(sp) -; RV64I-NEXT: bnez a0, .LBB6_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: lw a0, 12(sp) +; RV64I-NEXT: bnez a1, .LBB6_4 ; RV64I-NEXT: .LBB6_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: seqz a0, a3 -; RV64I-NEXT: sltu a1, s2, a3 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: mv a2, s1 -; RV64I-NEXT: bnez a0, .LBB6_1 +; RV64I-NEXT: seqz a1, a0 +; RV64I-NEXT: sltu a2, s2, a0 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: mv a2, s0 +; RV64I-NEXT: bnez a1, .LBB6_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB6_2 Depth=1 -; RV64I-NEXT: addiw a2, a3, -1 +; RV64I-NEXT: addiw a2, a0, -1 ; RV64I-NEXT: j .LBB6_1 ; RV64I-NEXT: .LBB6_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1282,49 +1280,48 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32I-NEXT: .cfi_offset s1, -12 ; RV32I-NEXT: .cfi_offset s2, -16 ; RV32I-NEXT: mv s0, a2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: lw a5, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: mv s2, a1 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a1, 4(s2) ; RV32I-NEXT: j .LBB7_2 ; RV32I-NEXT: .LBB7_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 5 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __atomic_compare_exchange_8 -; RV32I-NEXT: lw a5, 8(sp) -; 
RV32I-NEXT: lw a4, 12(sp) -; RV32I-NEXT: bnez a0, .LBB7_7 +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: lw a0, 8(sp) +; RV32I-NEXT: lw a1, 12(sp) +; RV32I-NEXT: bnez a2, .LBB7_7 ; RV32I-NEXT: .LBB7_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: beq a4, s0, .LBB7_4 +; RV32I-NEXT: beq a1, s0, .LBB7_4 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sltu a0, s0, a4 +; RV32I-NEXT: sltu a2, s0, a1 ; RV32I-NEXT: j .LBB7_5 ; RV32I-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: sltu a0, s2, a5 +; RV32I-NEXT: sltu a2, s1, a0 ; RV32I-NEXT: .LBB7_5: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: or a1, a5, a4 -; RV32I-NEXT: seqz a1, a1 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: mv a2, s2 +; RV32I-NEXT: or a3, a0, a1 +; RV32I-NEXT: seqz a3, a3 +; RV32I-NEXT: or a4, a3, a2 +; RV32I-NEXT: mv a2, s1 ; RV32I-NEXT: mv a3, s0 -; RV32I-NEXT: bnez a0, .LBB7_1 +; RV32I-NEXT: bnez a4, .LBB7_1 ; RV32I-NEXT: # %bb.6: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32I-NEXT: seqz a0, a5 -; RV32I-NEXT: sub a3, a4, a0 -; RV32I-NEXT: addi a2, a5, -1 +; RV32I-NEXT: seqz a2, a0 +; RV32I-NEXT: sub a3, a1, a2 +; RV32I-NEXT: addi a2, a0, -1 ; RV32I-NEXT: j .LBB7_1 ; RV32I-NEXT: .LBB7_7: # %atomicrmw.end -; RV32I-NEXT: mv a0, a5 -; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1350,49 +1347,48 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV32IA-NEXT: .cfi_offset s1, -12 ; RV32IA-NEXT: .cfi_offset s2, -16 ; RV32IA-NEXT: mv s0, a2 -; RV32IA-NEXT: mv s1, a0 -; RV32IA-NEXT: lw a5, 0(a0) -; RV32IA-NEXT: lw a4, 4(a0) -; RV32IA-NEXT: mv s2, a1 +; RV32IA-NEXT: mv s1, a1 +; RV32IA-NEXT: mv s2, a0 +; RV32IA-NEXT: lw a0, 0(a0) +; RV32IA-NEXT: lw a1, 4(s2) ; RV32IA-NEXT: j .LBB7_2 ; RV32IA-NEXT: .LBB7_1: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sw a5, 8(sp) -; RV32IA-NEXT: sw a4, 12(sp) +; RV32IA-NEXT: sw a0, 8(sp) +; RV32IA-NEXT: sw a1, 12(sp) ; RV32IA-NEXT: addi a1, sp, 8 ; RV32IA-NEXT: li a4, 5 ; RV32IA-NEXT: li a5, 5 -; RV32IA-NEXT: mv a0, s1 +; RV32IA-NEXT: mv a0, s2 ; RV32IA-NEXT: call __atomic_compare_exchange_8 -; RV32IA-NEXT: lw a5, 8(sp) -; RV32IA-NEXT: lw a4, 12(sp) -; RV32IA-NEXT: bnez a0, .LBB7_7 +; RV32IA-NEXT: mv a2, a0 +; RV32IA-NEXT: lw a0, 8(sp) +; RV32IA-NEXT: lw a1, 12(sp) +; RV32IA-NEXT: bnez a2, .LBB7_7 ; RV32IA-NEXT: .LBB7_2: # %atomicrmw.start ; RV32IA-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32IA-NEXT: beq a4, s0, .LBB7_4 +; RV32IA-NEXT: beq a1, s0, .LBB7_4 ; RV32IA-NEXT: # %bb.3: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sltu a0, s0, a4 +; RV32IA-NEXT: sltu a2, s0, a1 ; RV32IA-NEXT: j .LBB7_5 ; RV32IA-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: sltu a0, s2, a5 +; RV32IA-NEXT: sltu a2, s1, a0 ; RV32IA-NEXT: .LBB7_5: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: or a1, a5, a4 -; RV32IA-NEXT: seqz a1, a1 -; RV32IA-NEXT: or a0, a1, a0 -; RV32IA-NEXT: mv a2, s2 +; RV32IA-NEXT: or a3, a0, a1 +; RV32IA-NEXT: seqz a3, a3 +; RV32IA-NEXT: or a4, a3, a2 +; RV32IA-NEXT: mv a2, s1 ; RV32IA-NEXT: mv a3, s0 -; RV32IA-NEXT: bnez a0, .LBB7_1 +; RV32IA-NEXT: bnez a4, .LBB7_1 ; RV32IA-NEXT: # %bb.6: # %atomicrmw.start ; RV32IA-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV32IA-NEXT: seqz 
a0, a5 -; RV32IA-NEXT: sub a3, a4, a0 -; RV32IA-NEXT: addi a2, a5, -1 +; RV32IA-NEXT: seqz a2, a0 +; RV32IA-NEXT: sub a3, a1, a2 +; RV32IA-NEXT: addi a2, a0, -1 ; RV32IA-NEXT: j .LBB7_1 ; RV32IA-NEXT: .LBB7_7: # %atomicrmw.end -; RV32IA-NEXT: mv a0, a5 -; RV32IA-NEXT: mv a1, a4 ; RV32IA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32IA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -1415,33 +1411,33 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: ld a3, 0(a0) -; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: j .LBB7_2 ; RV64I-NEXT: .LBB7_1: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: sd a0, 0(sp) ; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: li a3, 5 ; RV64I-NEXT: li a4, 5 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __atomic_compare_exchange_8 -; RV64I-NEXT: ld a3, 0(sp) -; RV64I-NEXT: bnez a0, .LBB7_4 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: ld a0, 0(sp) +; RV64I-NEXT: bnez a1, .LBB7_4 ; RV64I-NEXT: .LBB7_2: # %atomicrmw.start ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: seqz a0, a3 -; RV64I-NEXT: sltu a1, s1, a3 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: mv a2, s1 -; RV64I-NEXT: bnez a0, .LBB7_1 +; RV64I-NEXT: seqz a1, a0 +; RV64I-NEXT: sltu a2, s0, a0 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: mv a2, s0 +; RV64I-NEXT: bnez a1, .LBB7_1 ; RV64I-NEXT: # %bb.3: # %atomicrmw.start ; RV64I-NEXT: # in Loop: Header=BB7_2 Depth=1 -; RV64I-NEXT: addi a2, a3, -1 +; RV64I-NEXT: addi a2, a0, -1 ; RV64I-NEXT: j .LBB7_1 ; RV64I-NEXT: .LBB7_4: # %atomicrmw.end -; RV64I-NEXT: mv a0, a3 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/bf16-promote.ll b/llvm/test/CodeGen/RISCV/bf16-promote.ll index 08c053fab4f67..b3f04975d04c4 100644 --- a/llvm/test/CodeGen/RISCV/bf16-promote.ll +++ b/llvm/test/CodeGen/RISCV/bf16-promote.ll @@ -111,12 +111,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lhu a0, 0(a1) -; RV64-NEXT: lhu a1, 0(s0) -; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: lhu a0, 0(a0) +; RV64-NEXT: lhu a1, 0(a1) ; RV64-NEXT: slli a1, a1, 16 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: fmv.w.x fa4, a1 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: fmv.w.x fa5, a1 +; RV64-NEXT: fmv.w.x fa4, a0 ; RV64-NEXT: fadd.s fa0, fa4, fa5 ; RV64-NEXT: call __truncsfbf2 ; RV64-NEXT: fmv.x.w a0, fa0 @@ -132,12 +132,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lhu a0, 0(a1) -; RV32-NEXT: lhu a1, 0(s0) -; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: lhu a0, 0(a0) +; RV32-NEXT: lhu a1, 0(a1) ; RV32-NEXT: slli a1, a1, 16 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: fmv.w.x fa4, a1 +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: fmv.w.x fa5, a1 +; RV32-NEXT: fmv.w.x fa4, a0 ; RV32-NEXT: fadd.s fa0, fa4, fa5 ; RV32-NEXT: call __truncsfbf2 ; RV32-NEXT: fmv.x.w a0, fa0 diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll 
b/llvm/test/CodeGen/RISCV/bfloat-convert.ll index 82359769c7c22..8621b3e980a04 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll @@ -51,13 +51,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-LABEL: fcvt_si_bf16_sat: ; CHECK32ZFBFMIN: # %bb.0: # %start ; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK32ZFBFMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK32ZFBFMIN-NEXT: lui a0, 815104 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa3, a0 -; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK32ZFBFMIN-NEXT: neg a0, a1 +; CHECK32ZFBFMIN-NEXT: lui a1, %hi(.LCPI1_0) +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32ZFBFMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32ZFBFMIN-NEXT: neg a0, a0 +; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32ZFBFMIN-NEXT: and a0, a0, a1 @@ -70,11 +70,11 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: fmv.w.x fa5, a1 ; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) ; RV32ID-NEXT: slli a0, a0, 16 -; RV32ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV32ID-NEXT: fmv.w.x fa3, a0 -; RV32ID-NEXT: feq.s a0, fa3, fa3 -; RV32ID-NEXT: fmax.s fa5, fa3, fa5 +; RV32ID-NEXT: fmv.w.x fa4, a0 +; RV32ID-NEXT: feq.s a0, fa4, fa4 +; RV32ID-NEXT: fmax.s fa5, fa4, fa5 ; RV32ID-NEXT: neg a0, a0 +; RV32ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 @@ -83,13 +83,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; CHECK64ZFBFMIN-LABEL: fcvt_si_bf16_sat: ; CHECK64ZFBFMIN: # %bb.0: # %start ; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK64ZFBFMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK64ZFBFMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK64ZFBFMIN-NEXT: lui a0, 815104 -; CHECK64ZFBFMIN-NEXT: fmv.w.x fa3, a0 -; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK64ZFBFMIN-NEXT: neg a0, a1 +; CHECK64ZFBFMIN-NEXT: lui a1, %hi(.LCPI1_0) +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64ZFBFMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64ZFBFMIN-NEXT: neg a0, a0 +; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64ZFBFMIN-NEXT: and a0, a0, a1 @@ -102,11 +102,11 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV64ID-NEXT: fmv.w.x fa5, a1 ; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) ; RV64ID-NEXT: slli a0, a0, 16 -; RV64ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV64ID-NEXT: fmv.w.x fa3, a0 -; RV64ID-NEXT: feq.s a0, fa3, fa3 -; RV64ID-NEXT: fmax.s fa5, fa3, fa5 +; RV64ID-NEXT: fmv.w.x fa4, a0 +; RV64ID-NEXT: feq.s a0, fa4, fa4 +; RV64ID-NEXT: fmax.s fa5, fa4, fa5 ; RV64ID-NEXT: neg a0, a0 +; RV64ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 @@ -152,49 +152,49 @@ define i16 @fcvt_ui_bf16(bfloat %a) nounwind { define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-LABEL: fcvt_ui_bf16_sat: ; CHECK32ZFBFMIN: # %bb.0: # %start +; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, zero ; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK32ZFBFMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 -; 
CHECK32ZFBFMIN-NEXT: fmv.w.x fa3, zero -; CHECK32ZFBFMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32ZFBFMIN-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.wu.s a0, fa5, rtz ; CHECK32ZFBFMIN-NEXT: ret ; ; RV32ID-LABEL: fcvt_ui_bf16_sat: ; RV32ID: # %bb.0: # %start -; RV32ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV32ID-NEXT: fmv.x.w a0, fa0 +; RV32ID-NEXT: fmv.w.x fa5, zero ; RV32ID-NEXT: slli a0, a0, 16 ; RV32ID-NEXT: fmv.w.x fa4, a0 -; RV32ID-NEXT: fmv.w.x fa3, zero -; RV32ID-NEXT: fmax.s fa4, fa4, fa3 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: lui a0, %hi(.LCPI3_0) +; RV32ID-NEXT: fmax.s fa5, fa4, fa5 +; RV32ID-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-NEXT: ret ; ; CHECK64ZFBFMIN-LABEL: fcvt_ui_bf16_sat: ; CHECK64ZFBFMIN: # %bb.0: # %start +; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, zero ; CHECK64ZFBFMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK64ZFBFMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 -; CHECK64ZFBFMIN-NEXT: fmv.w.x fa3, zero -; CHECK64ZFBFMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64ZFBFMIN-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.lu.s a0, fa5, rtz ; CHECK64ZFBFMIN-NEXT: ret ; ; RV64ID-LABEL: fcvt_ui_bf16_sat: ; RV64ID: # %bb.0: # %start -; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV64ID-NEXT: fmv.x.w a0, fa0 +; RV64ID-NEXT: fmv.w.x fa5, zero ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa4, a0 -; RV64ID-NEXT: fmv.w.x fa3, zero -; RV64ID-NEXT: fmax.s fa4, fa4, fa3 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) +; RV64ID-NEXT: fmax.s fa5, fa4, fa5 +; RV64ID-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-NEXT: ret start: @@ -647,14 +647,14 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK32ZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK32ZFBFMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa0, fa0 ; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI12_0) +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa5, zero +; CHECK32ZFBFMIN-NEXT: fle.s a1, fa5, fa0 ; CHECK32ZFBFMIN-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa0, fa0 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, zero -; CHECK32ZFBFMIN-NEXT: fle.s a0, fa4, fa0 -; CHECK32ZFBFMIN-NEXT: flt.s a1, fa5, fa0 -; CHECK32ZFBFMIN-NEXT: neg s0, a1 -; CHECK32ZFBFMIN-NEXT: neg s1, a0 +; CHECK32ZFBFMIN-NEXT: flt.s a0, fa5, fa0 +; CHECK32ZFBFMIN-NEXT: neg s0, a0 +; CHECK32ZFBFMIN-NEXT: neg s1, a1 ; CHECK32ZFBFMIN-NEXT: call __fixunssfdi ; CHECK32ZFBFMIN-NEXT: and a0, s1, a0 ; CHECK32ZFBFMIN-NEXT: and a1, s1, a1 @@ -675,11 +675,11 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: fmv.x.w a0, fa0 ; RV32ID-NEXT: lui a1, %hi(.LCPI12_0) ; RV32ID-NEXT: fmv.w.x fa5, zero -; RV32ID-NEXT: flw fa4, %lo(.LCPI12_0)(a1) ; RV32ID-NEXT: slli a0, a0, 16 ; RV32ID-NEXT: fmv.w.x fa0, a0 ; RV32ID-NEXT: fle.s a0, fa5, fa0 -; RV32ID-NEXT: flt.s a1, fa4, fa0 +; RV32ID-NEXT: flw fa5, %lo(.LCPI12_0)(a1) +; RV32ID-NEXT: flt.s 
a1, fa5, fa0 ; RV32ID-NEXT: neg s0, a1 ; RV32ID-NEXT: neg s1, a0 ; RV32ID-NEXT: call __fixunssfdi diff --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll index f9cf4e523b77d..504a698615841 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll @@ -7,11 +7,11 @@ define bfloat @flh(ptr %a) nounwind { ; CHECK-LABEL: flh: ; CHECK: # %bb.0: -; CHECK-NEXT: flh fa5, 6(a0) -; CHECK-NEXT: flh fa4, 0(a0) -; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: flh fa5, 0(a0) +; CHECK-NEXT: flh fa4, 6(a0) ; CHECK-NEXT: fcvt.s.bf16 fa4, fa4 -; CHECK-NEXT: fadd.s fa5, fa4, fa5 +; CHECK-NEXT: fcvt.s.bf16 fa5, fa5 +; CHECK-NEXT: fadd.s fa5, fa5, fa4 ; CHECK-NEXT: fcvt.bf16.s fa0, fa5 ; CHECK-NEXT: ret %1 = load bfloat, ptr %a diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll index c83b0ed6b0eee..1b93fdbbb68c2 100644 --- a/llvm/test/CodeGen/RISCV/bfloat.ll +++ b/llvm/test/CodeGen/RISCV/bfloat.ll @@ -447,12 +447,12 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV32ID-ILP32: # %bb.0: ; RV32ID-ILP32-NEXT: addi sp, sp, -16 ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32ID-ILP32-NEXT: lhu a1, 6(a0) -; RV32ID-ILP32-NEXT: lhu a0, 0(a0) -; RV32ID-ILP32-NEXT: slli a1, a1, 16 +; RV32ID-ILP32-NEXT: lhu a1, 0(a0) +; RV32ID-ILP32-NEXT: lhu a0, 6(a0) ; RV32ID-ILP32-NEXT: slli a0, a0, 16 -; RV32ID-ILP32-NEXT: fmv.w.x fa5, a1 -; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 +; RV32ID-ILP32-NEXT: slli a1, a1, 16 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32-NEXT: fadd.s fa5, fa4, fa5 ; RV32ID-ILP32-NEXT: fmv.x.w a0, fa5 ; RV32ID-ILP32-NEXT: call __truncsfbf2 @@ -466,12 +466,12 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV64ID-LP64: # %bb.0: ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64ID-LP64-NEXT: lhu a1, 6(a0) -; RV64ID-LP64-NEXT: lhu a0, 0(a0) -; RV64ID-LP64-NEXT: slli a1, a1, 16 +; RV64ID-LP64-NEXT: lhu a1, 0(a0) +; RV64ID-LP64-NEXT: lhu a0, 6(a0) ; RV64ID-LP64-NEXT: slli a0, a0, 16 -; RV64ID-LP64-NEXT: fmv.w.x fa5, a1 -; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 +; RV64ID-LP64-NEXT: slli a1, a1, 16 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fadd.s fa5, fa4, fa5 ; RV64ID-LP64-NEXT: fmv.x.w a0, fa5 ; RV64ID-LP64-NEXT: call __truncsfbf2 @@ -485,12 +485,12 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV32ID-ILP32D: # %bb.0: ; RV32ID-ILP32D-NEXT: addi sp, sp, -16 ; RV32ID-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32ID-ILP32D-NEXT: lhu a1, 6(a0) -; RV32ID-ILP32D-NEXT: lhu a0, 0(a0) -; RV32ID-ILP32D-NEXT: slli a1, a1, 16 +; RV32ID-ILP32D-NEXT: lhu a1, 0(a0) +; RV32ID-ILP32D-NEXT: lhu a0, 6(a0) ; RV32ID-ILP32D-NEXT: slli a0, a0, 16 -; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a1 -; RV32ID-ILP32D-NEXT: fmv.w.x fa4, a0 +; RV32ID-ILP32D-NEXT: slli a1, a1, 16 +; RV32ID-ILP32D-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32D-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32D-NEXT: fadd.s fa0, fa4, fa5 ; RV32ID-ILP32D-NEXT: call __truncsfbf2 ; RV32ID-ILP32D-NEXT: fmv.x.w a0, fa0 @@ -505,12 +505,12 @@ define bfloat @bfloat_load(ptr %a) nounwind { ; RV64ID-LP64D: # %bb.0: ; RV64ID-LP64D-NEXT: addi sp, sp, -16 ; RV64ID-LP64D-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64ID-LP64D-NEXT: lhu a1, 6(a0) -; RV64ID-LP64D-NEXT: lhu a0, 0(a0) -; RV64ID-LP64D-NEXT: slli a1, a1, 16 +; RV64ID-LP64D-NEXT: lhu a1, 0(a0) +; RV64ID-LP64D-NEXT: lhu a0, 6(a0) ; RV64ID-LP64D-NEXT: slli a0, a0, 16 -; 
RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 -; RV64ID-LP64D-NEXT: fmv.w.x fa4, a0 +; RV64ID-LP64D-NEXT: slli a1, a1, 16 +; RV64ID-LP64D-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64D-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64D-NEXT: fadd.s fa0, fa4, fa5 ; RV64ID-LP64D-NEXT: call __truncsfbf2 ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll index d69ab0550a034..0564764c3f0bc 100644 --- a/llvm/test/CodeGen/RISCV/bittest.ll +++ b/llvm/test/CodeGen/RISCV/bittest.ll @@ -552,12 +552,12 @@ declare void @bar() define signext i32 @bit_10_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; CHECK-LABEL: bit_10_z_select_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a3, a0, 1024 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: beqz a3, .LBB15_2 +; CHECK-NEXT: andi a0, a0, 1024 +; CHECK-NEXT: beqz a0, .LBB15_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %1 = and i32 %a, 1024 %2 = icmp eq i32 %1, 0 @@ -568,22 +568,22 @@ define signext i32 @bit_10_z_select_i32(i32 signext %a, i32 signext %b, i32 sign define signext i32 @bit_10_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_10_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 21 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bltz a3, .LBB16_2 +; RV32-NEXT: slli a0, a0, 21 +; RV32-NEXT: bltz a0, .LBB16_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB16_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_10_nz_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 53 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB16_2 +; RV64-NEXT: slli a0, a0, 53 +; RV64-NEXT: bltz a0, .LBB16_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB16_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 1024 %2 = icmp ne i32 %1, 0 @@ -594,22 +594,22 @@ define signext i32 @bit_10_nz_select_i32(i32 signext %a, i32 signext %b, i32 sig define signext i32 @bit_11_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_11_z_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 20 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bgez a3, .LBB17_2 +; RV32-NEXT: slli a0, a0, 20 +; RV32-NEXT: bgez a0, .LBB17_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB17_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_11_z_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 52 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB17_2 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: bgez a0, .LBB17_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB17_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 2048 %2 = icmp eq i32 %1, 0 @@ -620,22 +620,22 @@ define signext i32 @bit_11_z_select_i32(i32 signext %a, i32 signext %b, i32 sign define signext i32 @bit_11_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_11_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 20 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bltz a3, .LBB18_2 +; RV32-NEXT: slli a0, a0, 20 +; RV32-NEXT: bltz a0, .LBB18_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB18_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_11_nz_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 52 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB18_2 +; RV64-NEXT: slli a0, 
a0, 52 +; RV64-NEXT: bltz a0, .LBB18_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB18_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 2048 %2 = icmp ne i32 %1, 0 @@ -646,22 +646,22 @@ define signext i32 @bit_11_nz_select_i32(i32 signext %a, i32 signext %b, i32 sig define signext i32 @bit_20_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_20_z_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 11 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bgez a3, .LBB19_2 +; RV32-NEXT: slli a0, a0, 11 +; RV32-NEXT: bgez a0, .LBB19_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB19_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_z_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 43 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB19_2 +; RV64-NEXT: slli a0, a0, 43 +; RV64-NEXT: bgez a0, .LBB19_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB19_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 1048576 %2 = icmp eq i32 %1, 0 @@ -672,22 +672,22 @@ define signext i32 @bit_20_z_select_i32(i32 signext %a, i32 signext %b, i32 sign define signext i32 @bit_20_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_20_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 11 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bltz a3, .LBB20_2 +; RV32-NEXT: slli a0, a0, 11 +; RV32-NEXT: bltz a0, .LBB20_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB20_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_nz_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 43 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB20_2 +; RV64-NEXT: slli a0, a0, 43 +; RV64-NEXT: bltz a0, .LBB20_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB20_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 1048576 %2 = icmp ne i32 %1, 0 @@ -708,12 +708,12 @@ define signext i32 @bit_31_z_select_i32(i32 signext %a, i32 signext %b, i32 sign ; RV64-LABEL: bit_31_z_select_i32: ; RV64: # %bb.0: ; RV64-NEXT: lui a3, 524288 -; RV64-NEXT: and a3, a0, a3 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB21_2 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: beqz a0, .LBB21_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB21_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 2147483648 %2 = icmp eq i32 %1, 0 @@ -724,23 +724,23 @@ define signext i32 @bit_31_z_select_i32(i32 signext %a, i32 signext %b, i32 sign define signext i32 @bit_31_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_31_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: srli a3, a0, 31 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bnez a3, .LBB22_2 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: bnez a0, .LBB22_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB22_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_31_nz_select_i32: ; RV64: # %bb.0: ; RV64-NEXT: lui a3, 524288 -; RV64-NEXT: and a3, a0, a3 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB22_2 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: bnez a0, .LBB22_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB22_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 2147483648 %2 = icmp ne i32 %1, 0 @@ -752,23 +752,23 @@ define i64 
@bit_10_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_10_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: andi a3, a0, 1024 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a3, .LBB23_2 +; RV32-NEXT: andi a0, a0, 1024 +; RV32-NEXT: beqz a0, .LBB23_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB23_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_10_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: andi a3, a0, 1024 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB23_2 +; RV64-NEXT: andi a0, a0, 1024 +; RV64-NEXT: beqz a0, .LBB23_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB23_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 1024 %2 = icmp eq i64 %1, 0 @@ -781,47 +781,47 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: slli a0, a0, 21 -; RV32I-NEXT: srli a3, a0, 31 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a3, .LBB24_2 +; RV32I-NEXT: srli a0, a0, 31 +; RV32I-NEXT: bnez a0, .LBB24_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_10_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 53 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB24_2 +; RV64-NEXT: slli a0, a0, 53 +; RV64-NEXT: bltz a0, .LBB24_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB24_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret ; ; RV32ZBS-LABEL: bit_10_nz_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: mv a1, a3 -; RV32ZBS-NEXT: bexti a3, a0, 10 -; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a3, .LBB24_2 +; RV32ZBS-NEXT: bexti a0, a0, 10 +; RV32ZBS-NEXT: bnez a0, .LBB24_2 ; RV32ZBS-NEXT: # %bb.1: -; RV32ZBS-NEXT: mv a0, a4 +; RV32ZBS-NEXT: mv a2, a4 ; RV32ZBS-NEXT: mv a1, a5 ; RV32ZBS-NEXT: .LBB24_2: +; RV32ZBS-NEXT: mv a0, a2 ; RV32ZBS-NEXT: ret ; ; RV32XTHEADBS-LABEL: bit_10_nz_select_i64: ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: mv a1, a3 -; RV32XTHEADBS-NEXT: th.tst a3, a0, 10 -; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a3, .LBB24_2 +; RV32XTHEADBS-NEXT: th.tst a0, a0, 10 +; RV32XTHEADBS-NEXT: bnez a0, .LBB24_2 ; RV32XTHEADBS-NEXT: # %bb.1: -; RV32XTHEADBS-NEXT: mv a0, a4 +; RV32XTHEADBS-NEXT: mv a2, a4 ; RV32XTHEADBS-NEXT: mv a1, a5 ; RV32XTHEADBS-NEXT: .LBB24_2: +; RV32XTHEADBS-NEXT: mv a0, a2 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 1024 %2 = icmp ne i64 %1, 0 @@ -833,23 +833,23 @@ define i64 @bit_11_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_11_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 20 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bgez a3, .LBB25_2 +; RV32-NEXT: slli a0, a0, 20 +; RV32-NEXT: bgez a0, .LBB25_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB25_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_11_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 52 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB25_2 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: bgez a0, .LBB25_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB25_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2048 %2 = icmp eq i64 %1, 0 @@ -862,47 +862,47 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; 
RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: slli a0, a0, 20 -; RV32I-NEXT: srli a3, a0, 31 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a3, .LBB26_2 +; RV32I-NEXT: srli a0, a0, 31 +; RV32I-NEXT: bnez a0, .LBB26_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: .LBB26_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_11_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 52 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB26_2 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: bltz a0, .LBB26_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB26_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret ; ; RV32ZBS-LABEL: bit_11_nz_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: mv a1, a3 -; RV32ZBS-NEXT: bexti a3, a0, 11 -; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a3, .LBB26_2 +; RV32ZBS-NEXT: bexti a0, a0, 11 +; RV32ZBS-NEXT: bnez a0, .LBB26_2 ; RV32ZBS-NEXT: # %bb.1: -; RV32ZBS-NEXT: mv a0, a4 +; RV32ZBS-NEXT: mv a2, a4 ; RV32ZBS-NEXT: mv a1, a5 ; RV32ZBS-NEXT: .LBB26_2: +; RV32ZBS-NEXT: mv a0, a2 ; RV32ZBS-NEXT: ret ; ; RV32XTHEADBS-LABEL: bit_11_nz_select_i64: ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: mv a1, a3 -; RV32XTHEADBS-NEXT: th.tst a3, a0, 11 -; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a3, .LBB26_2 +; RV32XTHEADBS-NEXT: th.tst a0, a0, 11 +; RV32XTHEADBS-NEXT: bnez a0, .LBB26_2 ; RV32XTHEADBS-NEXT: # %bb.1: -; RV32XTHEADBS-NEXT: mv a0, a4 +; RV32XTHEADBS-NEXT: mv a2, a4 ; RV32XTHEADBS-NEXT: mv a1, a5 ; RV32XTHEADBS-NEXT: .LBB26_2: +; RV32XTHEADBS-NEXT: mv a0, a2 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 2048 %2 = icmp ne i64 %1, 0 @@ -914,23 +914,23 @@ define i64 @bit_20_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_20_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 11 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bgez a3, .LBB27_2 +; RV32-NEXT: slli a0, a0, 11 +; RV32-NEXT: bgez a0, .LBB27_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB27_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 43 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB27_2 +; RV64-NEXT: slli a0, a0, 43 +; RV64-NEXT: bgez a0, .LBB27_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB27_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 1048576 %2 = icmp eq i64 %1, 0 @@ -943,47 +943,47 @@ define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: slli a0, a0, 11 -; RV32I-NEXT: srli a3, a0, 31 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a3, .LBB28_2 +; RV32I-NEXT: srli a0, a0, 31 +; RV32I-NEXT: bnez a0, .LBB28_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: .LBB28_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_20_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 43 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB28_2 +; RV64-NEXT: slli a0, a0, 43 +; RV64-NEXT: bltz a0, .LBB28_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB28_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret ; ; RV32ZBS-LABEL: bit_20_nz_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: mv a1, a3 -; RV32ZBS-NEXT: bexti a3, a0, 20 -; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a3, .LBB28_2 +; RV32ZBS-NEXT: bexti a0, a0, 
20 +; RV32ZBS-NEXT: bnez a0, .LBB28_2 ; RV32ZBS-NEXT: # %bb.1: -; RV32ZBS-NEXT: mv a0, a4 +; RV32ZBS-NEXT: mv a2, a4 ; RV32ZBS-NEXT: mv a1, a5 ; RV32ZBS-NEXT: .LBB28_2: +; RV32ZBS-NEXT: mv a0, a2 ; RV32ZBS-NEXT: ret ; ; RV32XTHEADBS-LABEL: bit_20_nz_select_i64: ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: mv a1, a3 -; RV32XTHEADBS-NEXT: th.tst a3, a0, 20 -; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a3, .LBB28_2 +; RV32XTHEADBS-NEXT: th.tst a0, a0, 20 +; RV32XTHEADBS-NEXT: bnez a0, .LBB28_2 ; RV32XTHEADBS-NEXT: # %bb.1: -; RV32XTHEADBS-NEXT: mv a0, a4 +; RV32XTHEADBS-NEXT: mv a2, a4 ; RV32XTHEADBS-NEXT: mv a1, a5 ; RV32XTHEADBS-NEXT: .LBB28_2: +; RV32XTHEADBS-NEXT: mv a0, a2 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 1048576 %2 = icmp ne i64 %1, 0 @@ -1005,12 +1005,12 @@ define i64 @bit_31_z_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_31_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 32 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB29_2 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: bgez a0, .LBB29_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB29_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2147483648 %2 = icmp eq i64 %1, 0 @@ -1022,23 +1022,23 @@ define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_31_nz_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: srli a3, a0, 31 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a3, .LBB30_2 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: bnez a0, .LBB30_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB30_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_31_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 32 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB30_2 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: bltz a0, .LBB30_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB30_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2147483648 %2 = icmp ne i64 %1, 0 @@ -1049,8 +1049,8 @@ define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_32_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: mv a0, a2 +; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: beqz a1, .LBB31_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 @@ -1061,12 +1061,12 @@ define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_32_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 31 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB31_2 +; RV64-NEXT: slli a0, a0, 31 +; RV64-NEXT: bgez a0, .LBB31_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB31_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 4294967296 %2 = icmp eq i64 %1, 0 @@ -1077,8 +1077,8 @@ define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_32_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: mv a0, a2 +; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: bnez a1, .LBB32_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 @@ -1089,12 +1089,12 @@ define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_32_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 31 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB32_2 +; RV64-NEXT: slli a0, a0, 31 +; RV64-NEXT: bltz a0, .LBB32_2 ; 
RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB32_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 4294967296 %2 = icmp ne i64 %1, 0 @@ -1105,8 +1105,8 @@ define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_55_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: mv a0, a2 +; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: bgez a1, .LBB33_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 @@ -1117,12 +1117,12 @@ define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_55_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 8 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB33_2 +; RV64-NEXT: slli a0, a0, 8 +; RV64-NEXT: bgez a0, .LBB33_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB33_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 36028797018963968 %2 = icmp eq i64 %1, 0 @@ -1133,9 +1133,9 @@ define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I-LABEL: bit_55_nz_select_i64: ; RV32I: # %bb.0: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: srli a1, a1, 31 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: bnez a1, .LBB34_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a4 @@ -1146,18 +1146,18 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_55_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 8 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB34_2 +; RV64-NEXT: slli a0, a0, 8 +; RV64-NEXT: bltz a0, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB34_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret ; ; RV32ZBS-LABEL: bit_55_nz_select_i64: ; RV32ZBS: # %bb.0: -; RV32ZBS-NEXT: bexti a1, a1, 23 ; RV32ZBS-NEXT: mv a0, a2 +; RV32ZBS-NEXT: bexti a1, a1, 23 ; RV32ZBS-NEXT: bnez a1, .LBB34_2 ; RV32ZBS-NEXT: # %bb.1: ; RV32ZBS-NEXT: mv a0, a4 @@ -1168,8 +1168,8 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV32XTHEADBS-LABEL: bit_55_nz_select_i64: ; RV32XTHEADBS: # %bb.0: -; RV32XTHEADBS-NEXT: th.tst a1, a1, 23 ; RV32XTHEADBS-NEXT: mv a0, a2 +; RV32XTHEADBS-NEXT: th.tst a1, a1, 23 ; RV32XTHEADBS-NEXT: bnez a1, .LBB34_2 ; RV32XTHEADBS-NEXT: # %bb.1: ; RV32XTHEADBS-NEXT: mv a0, a4 @@ -1212,8 +1212,8 @@ define i64 @bit_63_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_63_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: mv a0, a2 +; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: bnez a1, .LBB36_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a4 @@ -1224,12 +1224,12 @@ define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_63_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: srli a3, a0, 63 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB36_2 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: bnez a0, .LBB36_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB36_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 9223372036854775808 %2 = icmp ne i64 %1, 0 @@ -1858,12 +1858,12 @@ define void @bit_63_nz_branch_i64(i64 %0) { define signext i32 @bit_10_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; CHECK-LABEL: bit_10_1_z_select_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a3, a0, 1023 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: beqz a3, .LBB59_2 +; 
CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: beqz a0, .LBB59_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB59_2: +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %1 = and i32 %a, 1023 %2 = icmp eq i32 %1, 0 @@ -1874,12 +1874,12 @@ define signext i32 @bit_10_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si define signext i32 @bit_10_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; CHECK-LABEL: bit_10_1_nz_select_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a3, a0, 1023 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: bnez a3, .LBB60_2 +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: bnez a0, .LBB60_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB60_2: +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %1 = and i32 %a, 1023 %2 = icmp ne i32 %1, 0 @@ -1890,12 +1890,12 @@ define signext i32 @bit_10_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s define signext i32 @bit_11_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; CHECK-LABEL: bit_11_1_z_select_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a3, a0, 2047 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: beqz a3, .LBB61_2 +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: beqz a0, .LBB61_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB61_2: +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %1 = and i32 %a, 2047 %2 = icmp eq i32 %1, 0 @@ -1906,12 +1906,12 @@ define signext i32 @bit_11_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si define signext i32 @bit_11_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; CHECK-LABEL: bit_11_1_nz_select_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a3, a0, 2047 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: bnez a3, .LBB62_2 +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: bnez a0, .LBB62_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB62_2: +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %1 = and i32 %a, 2047 %2 = icmp ne i32 %1, 0 @@ -1922,22 +1922,22 @@ define signext i32 @bit_11_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s define signext i32 @bit_16_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_16_1_z_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 16 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beqz a3, .LBB63_2 +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: beqz a0, .LBB63_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB63_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_16_1_z_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 48 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB63_2 +; RV64-NEXT: slli a0, a0, 48 +; RV64-NEXT: beqz a0, .LBB63_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB63_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 65535 %2 = icmp eq i32 %1, 0 @@ -1948,22 +1948,22 @@ define signext i32 @bit_16_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si define signext i32 @bit_16_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_16_1_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 16 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bnez a3, .LBB64_2 +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: bnez a0, .LBB64_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB64_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_16_1_nz_select_i32: ; RV64: # %bb.0: -; 
RV64-NEXT: slli a3, a0, 48 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB64_2 +; RV64-NEXT: slli a0, a0, 48 +; RV64-NEXT: bnez a0, .LBB64_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB64_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 65535 %2 = icmp ne i32 %1, 0 @@ -1974,22 +1974,22 @@ define signext i32 @bit_16_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s define signext i32 @bit_20_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_20_1_z_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 12 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beqz a3, .LBB65_2 +; RV32-NEXT: slli a0, a0, 12 +; RV32-NEXT: beqz a0, .LBB65_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB65_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_1_z_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 44 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB65_2 +; RV64-NEXT: slli a0, a0, 44 +; RV64-NEXT: beqz a0, .LBB65_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB65_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 1048575 %2 = icmp eq i32 %1, 0 @@ -2000,22 +2000,22 @@ define signext i32 @bit_20_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si define signext i32 @bit_20_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_20_1_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 12 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bnez a3, .LBB66_2 +; RV32-NEXT: slli a0, a0, 12 +; RV32-NEXT: bnez a0, .LBB66_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB66_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_1_nz_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 44 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB66_2 +; RV64-NEXT: slli a0, a0, 44 +; RV64-NEXT: bnez a0, .LBB66_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB66_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 1048575 %2 = icmp ne i32 %1, 0 @@ -2026,22 +2026,22 @@ define signext i32 @bit_20_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s define signext i32 @bit_31_1_z_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_31_1_z_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beqz a3, .LBB67_2 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB67_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 ; RV32-NEXT: .LBB67_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_31_1_z_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 33 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB67_2 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: beqz a0, .LBB67_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB67_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 2147483647 %2 = icmp eq i32 %1, 0 @@ -2052,22 +2052,22 @@ define signext i32 @bit_31_1_z_select_i32(i32 signext %a, i32 signext %b, i32 si define signext i32 @bit_31_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 signext %c) { ; RV32-LABEL: bit_31_1_nz_select_i32: ; RV32: # %bb.0: -; RV32-NEXT: slli a3, a0, 1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: bnez a3, .LBB68_2 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: bnez a0, .LBB68_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a1, a2 
; RV32-NEXT: .LBB68_2: +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_31_1_nz_select_i32: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 33 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB68_2 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: bnez a0, .LBB68_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB68_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i32 %a, 2147483647 %2 = icmp ne i32 %1, 0 @@ -2109,23 +2109,23 @@ define i64 @bit_10_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_10_1_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: andi a3, a0, 1023 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a3, .LBB71_2 +; RV32-NEXT: andi a0, a0, 1023 +; RV32-NEXT: beqz a0, .LBB71_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB71_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_10_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: andi a3, a0, 1023 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB71_2 +; RV64-NEXT: andi a0, a0, 1023 +; RV64-NEXT: beqz a0, .LBB71_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB71_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 1023 %2 = icmp eq i64 %1, 0 @@ -2137,23 +2137,23 @@ define i64 @bit_10_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_10_1_nz_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: andi a3, a0, 1023 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a3, .LBB72_2 +; RV32-NEXT: andi a0, a0, 1023 +; RV32-NEXT: bnez a0, .LBB72_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB72_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_10_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: andi a3, a0, 1023 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB72_2 +; RV64-NEXT: andi a0, a0, 1023 +; RV64-NEXT: bnez a0, .LBB72_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB72_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 1023 %2 = icmp ne i64 %1, 0 @@ -2165,23 +2165,23 @@ define i64 @bit_11_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_11_1_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: andi a3, a0, 2047 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a3, .LBB73_2 +; RV32-NEXT: andi a0, a0, 2047 +; RV32-NEXT: beqz a0, .LBB73_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB73_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_11_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: andi a3, a0, 2047 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB73_2 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: beqz a0, .LBB73_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB73_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2047 %2 = icmp eq i64 %1, 0 @@ -2193,23 +2193,23 @@ define i64 @bit_11_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_11_1_nz_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: andi a3, a0, 2047 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a3, .LBB74_2 +; RV32-NEXT: andi a0, a0, 2047 +; RV32-NEXT: bnez a0, .LBB74_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB74_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_11_1_nz_select_i64: ; RV64: # 
%bb.0: -; RV64-NEXT: andi a3, a0, 2047 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB74_2 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: bnez a0, .LBB74_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB74_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2047 %2 = icmp ne i64 %1, 0 @@ -2221,23 +2221,23 @@ define i64 @bit_16_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_16_1_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 16 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a3, .LBB75_2 +; RV32-NEXT: slli a0, a0, 16 +; RV32-NEXT: beqz a0, .LBB75_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB75_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_16_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 48 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB75_2 +; RV64-NEXT: slli a0, a0, 48 +; RV64-NEXT: beqz a0, .LBB75_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB75_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 65535 %2 = icmp eq i64 %1, 0 @@ -2259,12 +2259,12 @@ define i64 @bit_16_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_16_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB76_2 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: bnez a0, .LBB76_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB76_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 4294967295 %2 = icmp ne i64 %1, 0 @@ -2277,23 +2277,23 @@ define i64 @bit_20_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_20_1_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 12 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a3, .LBB77_2 +; RV32-NEXT: slli a0, a0, 12 +; RV32-NEXT: beqz a0, .LBB77_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB77_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 44 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB77_2 +; RV64-NEXT: slli a0, a0, 44 +; RV64-NEXT: beqz a0, .LBB77_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB77_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 1048575 %2 = icmp eq i64 %1, 0 @@ -2305,23 +2305,23 @@ define i64 @bit_20_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_20_1_nz_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 12 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a3, .LBB78_2 +; RV32-NEXT: slli a0, a0, 12 +; RV32-NEXT: bnez a0, .LBB78_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB78_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_20_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 44 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB78_2 +; RV64-NEXT: slli a0, a0, 44 +; RV64-NEXT: bnez a0, .LBB78_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB78_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 1048575 %2 = icmp ne i64 %1, 0 @@ -2333,23 +2333,23 @@ define i64 @bit_31_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_31_1_z_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 
1 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a3, .LBB79_2 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB79_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB79_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_31_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 33 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB79_2 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: beqz a0, .LBB79_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB79_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2147483647 %2 = icmp eq i64 %1, 0 @@ -2361,23 +2361,23 @@ define i64 @bit_31_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_31_1_nz_select_i64: ; RV32: # %bb.0: ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: slli a3, a0, 1 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a3, .LBB80_2 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: bnez a0, .LBB80_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a1, a5 ; RV32-NEXT: .LBB80_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_31_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 33 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB80_2 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: bnez a0, .LBB80_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB80_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 2147483647 %2 = icmp ne i64 %1, 0 @@ -2399,12 +2399,12 @@ define i64 @bit_32_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_32_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB81_2 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: beqz a0, .LBB81_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB81_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 4294967295 %2 = icmp eq i64 %1, 0 @@ -2426,12 +2426,12 @@ define i64 @bit_32_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; ; RV64-LABEL: bit_32_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB82_2 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: bnez a0, .LBB82_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB82_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 4294967295 %2 = icmp ne i64 %1, 0 @@ -2444,24 +2444,24 @@ define i64 @bit_55_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a1, 9 ; RV32-NEXT: srli a1, a1, 9 -; RV32-NEXT: or a1, a0, a1 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a1, .LBB83_2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: beqz a0, .LBB83_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB83_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_55_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 9 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB83_2 +; RV64-NEXT: slli a0, a0, 9 +; RV64-NEXT: beqz a0, .LBB83_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB83_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 36028797018963967 %2 = icmp eq i64 %1, 0 @@ -2474,24 +2474,24 @@ define i64 @bit_55_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32: # %bb.0: ; RV32-NEXT: slli a1, a1, 9 ; RV32-NEXT: srli a1, a1, 9 -; RV32-NEXT: or a1, a0, a1 -; RV32-NEXT: mv a0, a2 -; 
RV32-NEXT: bnez a1, .LBB84_2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: bnez a0, .LBB84_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB84_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: bit_55_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 9 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB84_2 +; RV64-NEXT: slli a0, a0, 9 +; RV64-NEXT: bnez a0, .LBB84_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB84_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %1 = and i64 %a, 36028797018963967 %2 = icmp ne i64 %1, 0 @@ -2504,36 +2504,36 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a1, .LBB85_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: beqz a0, .LBB85_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB85_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_63_1_z_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 1 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: beqz a3, .LBB85_2 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: beqz a0, .LBB85_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB85_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret ; ; RV32ZBS-LABEL: bit_63_1_z_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: bclri a1, a1, 31 -; RV32ZBS-NEXT: or a1, a0, a1 -; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: beqz a1, .LBB85_2 +; RV32ZBS-NEXT: or a0, a0, a1 +; RV32ZBS-NEXT: beqz a0, .LBB85_2 ; RV32ZBS-NEXT: # %bb.1: -; RV32ZBS-NEXT: mv a0, a4 +; RV32ZBS-NEXT: mv a2, a4 ; RV32ZBS-NEXT: mv a3, a5 ; RV32ZBS-NEXT: .LBB85_2: +; RV32ZBS-NEXT: mv a0, a2 ; RV32ZBS-NEXT: mv a1, a3 ; RV32ZBS-NEXT: ret ; @@ -2541,13 +2541,13 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: slli a1, a1, 1 ; RV32XTHEADBS-NEXT: srli a1, a1, 1 -; RV32XTHEADBS-NEXT: or a1, a0, a1 -; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: beqz a1, .LBB85_2 +; RV32XTHEADBS-NEXT: or a0, a0, a1 +; RV32XTHEADBS-NEXT: beqz a0, .LBB85_2 ; RV32XTHEADBS-NEXT: # %bb.1: -; RV32XTHEADBS-NEXT: mv a0, a4 +; RV32XTHEADBS-NEXT: mv a2, a4 ; RV32XTHEADBS-NEXT: mv a3, a5 ; RV32XTHEADBS-NEXT: .LBB85_2: +; RV32XTHEADBS-NEXT: mv a0, a2 ; RV32XTHEADBS-NEXT: mv a1, a3 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 9223372036854775807 @@ -2561,36 +2561,36 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a1, .LBB86_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB86_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB86_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; ; RV64-LABEL: bit_63_1_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 1 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bnez a3, .LBB86_2 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: bnez a0, .LBB86_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB86_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret ; ; RV32ZBS-LABEL: bit_63_1_nz_select_i64: ; RV32ZBS: # %bb.0: ; RV32ZBS-NEXT: bclri a1, a1, 31 -; RV32ZBS-NEXT: or 
a1, a0, a1 -; RV32ZBS-NEXT: mv a0, a2 -; RV32ZBS-NEXT: bnez a1, .LBB86_2 +; RV32ZBS-NEXT: or a0, a0, a1 +; RV32ZBS-NEXT: bnez a0, .LBB86_2 ; RV32ZBS-NEXT: # %bb.1: -; RV32ZBS-NEXT: mv a0, a4 +; RV32ZBS-NEXT: mv a2, a4 ; RV32ZBS-NEXT: mv a3, a5 ; RV32ZBS-NEXT: .LBB86_2: +; RV32ZBS-NEXT: mv a0, a2 ; RV32ZBS-NEXT: mv a1, a3 ; RV32ZBS-NEXT: ret ; @@ -2598,13 +2598,13 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32XTHEADBS: # %bb.0: ; RV32XTHEADBS-NEXT: slli a1, a1, 1 ; RV32XTHEADBS-NEXT: srli a1, a1, 1 -; RV32XTHEADBS-NEXT: or a1, a0, a1 -; RV32XTHEADBS-NEXT: mv a0, a2 -; RV32XTHEADBS-NEXT: bnez a1, .LBB86_2 +; RV32XTHEADBS-NEXT: or a0, a0, a1 +; RV32XTHEADBS-NEXT: bnez a0, .LBB86_2 ; RV32XTHEADBS-NEXT: # %bb.1: -; RV32XTHEADBS-NEXT: mv a0, a4 +; RV32XTHEADBS-NEXT: mv a2, a4 ; RV32XTHEADBS-NEXT: mv a3, a5 ; RV32XTHEADBS-NEXT: .LBB86_2: +; RV32XTHEADBS-NEXT: mv a0, a2 ; RV32XTHEADBS-NEXT: mv a1, a3 ; RV32XTHEADBS-NEXT: ret %1 = and i64 %a, 9223372036854775807 @@ -2616,13 +2616,13 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_64_1_z_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: or a1, a0, a1 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: beqz a1, .LBB87_2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: beqz a0, .LBB87_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB87_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; @@ -2643,13 +2643,13 @@ define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) { define i64 @bit_64_1_nz_select_i64(i64 %a, i64 %b, i64 %c) { ; RV32-LABEL: bit_64_1_nz_select_i64: ; RV32: # %bb.0: -; RV32-NEXT: or a1, a0, a1 -; RV32-NEXT: mv a0, a2 -; RV32-NEXT: bnez a1, .LBB88_2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: bnez a0, .LBB88_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a2, a4 ; RV32-NEXT: mv a3, a5 ; RV32-NEXT: .LBB88_2: +; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 02aeebdeb3775..de325010bb281 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll @@ -76,8 +76,8 @@ define i32 @test_lshr(i32 %v) { ; RV32-NEXT: .LBB2_1: # %for.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: andi a2, a0, 1 -; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: bnez a0, .LBB2_1 ; RV32-NEXT: .LBB2_2: # %for.end ; RV32-NEXT: mv a0, a1 @@ -92,8 +92,8 @@ define i32 @test_lshr(i32 %v) { ; RV64-NEXT: .LBB2_2: # %for.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: andi a2, a0, 1 -; RV64-NEXT: srliw a0, a0, 1 ; RV64-NEXT: addw a1, a1, a2 +; RV64-NEXT: srliw a0, a0, 1 ; RV64-NEXT: bnez a0, .LBB2_2 ; RV64-NEXT: .LBB2_3: # %for.end ; RV64-NEXT: mv a0, a1 @@ -129,9 +129,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV32-NEXT: lw a3, 0(a1) ; RV32-NEXT: addi a4, a1, 4 ; RV32-NEXT: slli a3, a3, 1 -; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: mv a0, a1 +; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: mv a1, a4 ; RV32-NEXT: bne a4, a2, .LBB3_2 ; RV32-NEXT: .LBB3_3: # %while.end @@ -153,9 +152,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV64-NEXT: lw a3, 0(a1) ; RV64-NEXT: addi a4, a1, 4 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: sw 
a3, 0(a0) -; RV64-NEXT: mv a0, a1 +; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: mv a1, a4 ; RV64-NEXT: bne a4, a2, .LBB3_2 ; RV64-NEXT: .LBB3_3: # %while.end diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll index 337e9bc5845f9..88ad8e6930287 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr32s.ll @@ -53,22 +53,22 @@ define void @callee() nounwind { ; ILP32-NEXT: flw fs1, 84(a1) ; ILP32-NEXT: flw fs2, 88(a1) ; ILP32-NEXT: flw fs3, 92(a1) -; ILP32-NEXT: flw fs4, 112(a1) -; ILP32-NEXT: flw fs5, 116(a1) -; ILP32-NEXT: flw fs6, 120(a1) -; ILP32-NEXT: flw fs7, 124(a1) -; ILP32-NEXT: flw fs8, 96(a1) -; ILP32-NEXT: flw fs9, 100(a1) -; ILP32-NEXT: flw fs10, 104(a1) -; ILP32-NEXT: flw fs11, 108(a1) -; ILP32-NEXT: fsw fs7, 124(a1) -; ILP32-NEXT: fsw fs6, 120(a1) -; ILP32-NEXT: fsw fs5, 116(a1) -; ILP32-NEXT: fsw fs4, 112(a1) -; ILP32-NEXT: fsw fs11, 108(a1) -; ILP32-NEXT: fsw fs10, 104(a1) -; ILP32-NEXT: fsw fs9, 100(a1) -; ILP32-NEXT: fsw fs8, 96(a1) +; ILP32-NEXT: flw fs4, 96(a1) +; ILP32-NEXT: flw fs5, 100(a1) +; ILP32-NEXT: flw fs6, 104(a1) +; ILP32-NEXT: flw fs7, 108(a1) +; ILP32-NEXT: flw fs8, 112(a1) +; ILP32-NEXT: flw fs9, 116(a1) +; ILP32-NEXT: flw fs10, 120(a1) +; ILP32-NEXT: flw fs11, 124(a1) +; ILP32-NEXT: fsw fs11, 124(a1) +; ILP32-NEXT: fsw fs10, 120(a1) +; ILP32-NEXT: fsw fs9, 116(a1) +; ILP32-NEXT: fsw fs8, 112(a1) +; ILP32-NEXT: fsw fs7, 108(a1) +; ILP32-NEXT: fsw fs6, 104(a1) +; ILP32-NEXT: fsw fs5, 100(a1) +; ILP32-NEXT: fsw fs4, 96(a1) ; ILP32-NEXT: fsw fs3, 92(a1) ; ILP32-NEXT: fsw fs2, 88(a1) ; ILP32-NEXT: fsw fs1, 84(a1) @@ -123,22 +123,22 @@ define void @callee() nounwind { ; ILP32E-NEXT: flw fs1, 84(a1) ; ILP32E-NEXT: flw fs2, 88(a1) ; ILP32E-NEXT: flw fs3, 92(a1) -; ILP32E-NEXT: flw fs4, 112(a1) -; ILP32E-NEXT: flw fs5, 116(a1) -; ILP32E-NEXT: flw fs6, 120(a1) -; ILP32E-NEXT: flw fs7, 124(a1) -; ILP32E-NEXT: flw fs8, 96(a1) -; ILP32E-NEXT: flw fs9, 100(a1) -; ILP32E-NEXT: flw fs10, 104(a1) -; ILP32E-NEXT: flw fs11, 108(a1) -; ILP32E-NEXT: fsw fs7, 124(a1) -; ILP32E-NEXT: fsw fs6, 120(a1) -; ILP32E-NEXT: fsw fs5, 116(a1) -; ILP32E-NEXT: fsw fs4, 112(a1) -; ILP32E-NEXT: fsw fs11, 108(a1) -; ILP32E-NEXT: fsw fs10, 104(a1) -; ILP32E-NEXT: fsw fs9, 100(a1) -; ILP32E-NEXT: fsw fs8, 96(a1) +; ILP32E-NEXT: flw fs4, 96(a1) +; ILP32E-NEXT: flw fs5, 100(a1) +; ILP32E-NEXT: flw fs6, 104(a1) +; ILP32E-NEXT: flw fs7, 108(a1) +; ILP32E-NEXT: flw fs8, 112(a1) +; ILP32E-NEXT: flw fs9, 116(a1) +; ILP32E-NEXT: flw fs10, 120(a1) +; ILP32E-NEXT: flw fs11, 124(a1) +; ILP32E-NEXT: fsw fs11, 124(a1) +; ILP32E-NEXT: fsw fs10, 120(a1) +; ILP32E-NEXT: fsw fs9, 116(a1) +; ILP32E-NEXT: fsw fs8, 112(a1) +; ILP32E-NEXT: fsw fs7, 108(a1) +; ILP32E-NEXT: fsw fs6, 104(a1) +; ILP32E-NEXT: fsw fs5, 100(a1) +; ILP32E-NEXT: fsw fs4, 96(a1) ; ILP32E-NEXT: fsw fs3, 92(a1) ; ILP32E-NEXT: fsw fs2, 88(a1) ; ILP32E-NEXT: fsw fs1, 84(a1) @@ -193,22 +193,22 @@ define void @callee() nounwind { ; LP64-NEXT: flw fs1, 84(a1) ; LP64-NEXT: flw fs2, 88(a1) ; LP64-NEXT: flw fs3, 92(a1) -; LP64-NEXT: flw fs4, 112(a1) -; LP64-NEXT: flw fs5, 116(a1) -; LP64-NEXT: flw fs6, 120(a1) -; LP64-NEXT: flw fs7, 124(a1) -; LP64-NEXT: flw fs8, 96(a1) -; LP64-NEXT: flw fs9, 100(a1) -; LP64-NEXT: flw fs10, 104(a1) -; LP64-NEXT: flw fs11, 108(a1) -; LP64-NEXT: fsw fs7, 124(a1) -; LP64-NEXT: fsw fs6, 120(a1) -; LP64-NEXT: fsw fs5, 116(a1) -; LP64-NEXT: fsw fs4, 112(a1) -; LP64-NEXT: fsw fs11, 108(a1) -; LP64-NEXT: fsw fs10, 104(a1) -; 
LP64-NEXT: fsw fs9, 100(a1) -; LP64-NEXT: fsw fs8, 96(a1) +; LP64-NEXT: flw fs4, 96(a1) +; LP64-NEXT: flw fs5, 100(a1) +; LP64-NEXT: flw fs6, 104(a1) +; LP64-NEXT: flw fs7, 108(a1) +; LP64-NEXT: flw fs8, 112(a1) +; LP64-NEXT: flw fs9, 116(a1) +; LP64-NEXT: flw fs10, 120(a1) +; LP64-NEXT: flw fs11, 124(a1) +; LP64-NEXT: fsw fs11, 124(a1) +; LP64-NEXT: fsw fs10, 120(a1) +; LP64-NEXT: fsw fs9, 116(a1) +; LP64-NEXT: fsw fs8, 112(a1) +; LP64-NEXT: fsw fs7, 108(a1) +; LP64-NEXT: fsw fs6, 104(a1) +; LP64-NEXT: fsw fs5, 100(a1) +; LP64-NEXT: fsw fs4, 96(a1) ; LP64-NEXT: fsw fs3, 92(a1) ; LP64-NEXT: fsw fs2, 88(a1) ; LP64-NEXT: fsw fs1, 84(a1) @@ -263,22 +263,22 @@ define void @callee() nounwind { ; LP64E-NEXT: flw fs1, 84(a1) ; LP64E-NEXT: flw fs2, 88(a1) ; LP64E-NEXT: flw fs3, 92(a1) -; LP64E-NEXT: flw fs4, 112(a1) -; LP64E-NEXT: flw fs5, 116(a1) -; LP64E-NEXT: flw fs6, 120(a1) -; LP64E-NEXT: flw fs7, 124(a1) -; LP64E-NEXT: flw fs8, 96(a1) -; LP64E-NEXT: flw fs9, 100(a1) -; LP64E-NEXT: flw fs10, 104(a1) -; LP64E-NEXT: flw fs11, 108(a1) -; LP64E-NEXT: fsw fs7, 124(a1) -; LP64E-NEXT: fsw fs6, 120(a1) -; LP64E-NEXT: fsw fs5, 116(a1) -; LP64E-NEXT: fsw fs4, 112(a1) -; LP64E-NEXT: fsw fs11, 108(a1) -; LP64E-NEXT: fsw fs10, 104(a1) -; LP64E-NEXT: fsw fs9, 100(a1) -; LP64E-NEXT: fsw fs8, 96(a1) +; LP64E-NEXT: flw fs4, 96(a1) +; LP64E-NEXT: flw fs5, 100(a1) +; LP64E-NEXT: flw fs6, 104(a1) +; LP64E-NEXT: flw fs7, 108(a1) +; LP64E-NEXT: flw fs8, 112(a1) +; LP64E-NEXT: flw fs9, 116(a1) +; LP64E-NEXT: flw fs10, 120(a1) +; LP64E-NEXT: flw fs11, 124(a1) +; LP64E-NEXT: fsw fs11, 124(a1) +; LP64E-NEXT: fsw fs10, 120(a1) +; LP64E-NEXT: fsw fs9, 116(a1) +; LP64E-NEXT: fsw fs8, 112(a1) +; LP64E-NEXT: fsw fs7, 108(a1) +; LP64E-NEXT: fsw fs6, 104(a1) +; LP64E-NEXT: fsw fs5, 100(a1) +; LP64E-NEXT: fsw fs4, 96(a1) ; LP64E-NEXT: fsw fs3, 92(a1) ; LP64E-NEXT: fsw fs2, 88(a1) ; LP64E-NEXT: fsw fs1, 84(a1) @@ -346,22 +346,22 @@ define void @callee() nounwind { ; ILP32F-NEXT: flw fs1, 84(a1) ; ILP32F-NEXT: flw fs2, 88(a1) ; ILP32F-NEXT: flw fs3, 92(a1) -; ILP32F-NEXT: flw fs4, 112(a1) -; ILP32F-NEXT: flw fs5, 116(a1) -; ILP32F-NEXT: flw fs6, 120(a1) -; ILP32F-NEXT: flw fs7, 124(a1) -; ILP32F-NEXT: flw fs8, 96(a1) -; ILP32F-NEXT: flw fs9, 100(a1) -; ILP32F-NEXT: flw fs10, 104(a1) -; ILP32F-NEXT: flw fs11, 108(a1) -; ILP32F-NEXT: fsw fs7, 124(a1) -; ILP32F-NEXT: fsw fs6, 120(a1) -; ILP32F-NEXT: fsw fs5, 116(a1) -; ILP32F-NEXT: fsw fs4, 112(a1) -; ILP32F-NEXT: fsw fs11, 108(a1) -; ILP32F-NEXT: fsw fs10, 104(a1) -; ILP32F-NEXT: fsw fs9, 100(a1) -; ILP32F-NEXT: fsw fs8, 96(a1) +; ILP32F-NEXT: flw fs4, 96(a1) +; ILP32F-NEXT: flw fs5, 100(a1) +; ILP32F-NEXT: flw fs6, 104(a1) +; ILP32F-NEXT: flw fs7, 108(a1) +; ILP32F-NEXT: flw fs8, 112(a1) +; ILP32F-NEXT: flw fs9, 116(a1) +; ILP32F-NEXT: flw fs10, 120(a1) +; ILP32F-NEXT: flw fs11, 124(a1) +; ILP32F-NEXT: fsw fs11, 124(a1) +; ILP32F-NEXT: fsw fs10, 120(a1) +; ILP32F-NEXT: fsw fs9, 116(a1) +; ILP32F-NEXT: fsw fs8, 112(a1) +; ILP32F-NEXT: fsw fs7, 108(a1) +; ILP32F-NEXT: fsw fs6, 104(a1) +; ILP32F-NEXT: fsw fs5, 100(a1) +; ILP32F-NEXT: fsw fs4, 96(a1) ; ILP32F-NEXT: fsw fs3, 92(a1) ; ILP32F-NEXT: fsw fs2, 88(a1) ; ILP32F-NEXT: fsw fs1, 84(a1) @@ -442,22 +442,22 @@ define void @callee() nounwind { ; LP64F-NEXT: flw fs1, 84(a1) ; LP64F-NEXT: flw fs2, 88(a1) ; LP64F-NEXT: flw fs3, 92(a1) -; LP64F-NEXT: flw fs4, 112(a1) -; LP64F-NEXT: flw fs5, 116(a1) -; LP64F-NEXT: flw fs6, 120(a1) -; LP64F-NEXT: flw fs7, 124(a1) -; LP64F-NEXT: flw fs8, 96(a1) -; LP64F-NEXT: flw fs9, 100(a1) -; 
LP64F-NEXT: flw fs10, 104(a1) -; LP64F-NEXT: flw fs11, 108(a1) -; LP64F-NEXT: fsw fs7, 124(a1) -; LP64F-NEXT: fsw fs6, 120(a1) -; LP64F-NEXT: fsw fs5, 116(a1) -; LP64F-NEXT: fsw fs4, 112(a1) -; LP64F-NEXT: fsw fs11, 108(a1) -; LP64F-NEXT: fsw fs10, 104(a1) -; LP64F-NEXT: fsw fs9, 100(a1) -; LP64F-NEXT: fsw fs8, 96(a1) +; LP64F-NEXT: flw fs4, 96(a1) +; LP64F-NEXT: flw fs5, 100(a1) +; LP64F-NEXT: flw fs6, 104(a1) +; LP64F-NEXT: flw fs7, 108(a1) +; LP64F-NEXT: flw fs8, 112(a1) +; LP64F-NEXT: flw fs9, 116(a1) +; LP64F-NEXT: flw fs10, 120(a1) +; LP64F-NEXT: flw fs11, 124(a1) +; LP64F-NEXT: fsw fs11, 124(a1) +; LP64F-NEXT: fsw fs10, 120(a1) +; LP64F-NEXT: fsw fs9, 116(a1) +; LP64F-NEXT: fsw fs8, 112(a1) +; LP64F-NEXT: fsw fs7, 108(a1) +; LP64F-NEXT: fsw fs6, 104(a1) +; LP64F-NEXT: fsw fs5, 100(a1) +; LP64F-NEXT: fsw fs4, 96(a1) ; LP64F-NEXT: fsw fs3, 92(a1) ; LP64F-NEXT: fsw fs2, 88(a1) ; LP64F-NEXT: fsw fs1, 84(a1) @@ -538,22 +538,22 @@ define void @callee() nounwind { ; ILP32D-NEXT: flw fs1, 84(a1) ; ILP32D-NEXT: flw fs2, 88(a1) ; ILP32D-NEXT: flw fs3, 92(a1) -; ILP32D-NEXT: flw fs4, 112(a1) -; ILP32D-NEXT: flw fs5, 116(a1) -; ILP32D-NEXT: flw fs6, 120(a1) -; ILP32D-NEXT: flw fs7, 124(a1) -; ILP32D-NEXT: flw fs8, 96(a1) -; ILP32D-NEXT: flw fs9, 100(a1) -; ILP32D-NEXT: flw fs10, 104(a1) -; ILP32D-NEXT: flw fs11, 108(a1) -; ILP32D-NEXT: fsw fs7, 124(a1) -; ILP32D-NEXT: fsw fs6, 120(a1) -; ILP32D-NEXT: fsw fs5, 116(a1) -; ILP32D-NEXT: fsw fs4, 112(a1) -; ILP32D-NEXT: fsw fs11, 108(a1) -; ILP32D-NEXT: fsw fs10, 104(a1) -; ILP32D-NEXT: fsw fs9, 100(a1) -; ILP32D-NEXT: fsw fs8, 96(a1) +; ILP32D-NEXT: flw fs4, 96(a1) +; ILP32D-NEXT: flw fs5, 100(a1) +; ILP32D-NEXT: flw fs6, 104(a1) +; ILP32D-NEXT: flw fs7, 108(a1) +; ILP32D-NEXT: flw fs8, 112(a1) +; ILP32D-NEXT: flw fs9, 116(a1) +; ILP32D-NEXT: flw fs10, 120(a1) +; ILP32D-NEXT: flw fs11, 124(a1) +; ILP32D-NEXT: fsw fs11, 124(a1) +; ILP32D-NEXT: fsw fs10, 120(a1) +; ILP32D-NEXT: fsw fs9, 116(a1) +; ILP32D-NEXT: fsw fs8, 112(a1) +; ILP32D-NEXT: fsw fs7, 108(a1) +; ILP32D-NEXT: fsw fs6, 104(a1) +; ILP32D-NEXT: fsw fs5, 100(a1) +; ILP32D-NEXT: fsw fs4, 96(a1) ; ILP32D-NEXT: fsw fs3, 92(a1) ; ILP32D-NEXT: fsw fs2, 88(a1) ; ILP32D-NEXT: fsw fs1, 84(a1) @@ -634,22 +634,22 @@ define void @callee() nounwind { ; LP64D-NEXT: flw fs1, 84(a1) ; LP64D-NEXT: flw fs2, 88(a1) ; LP64D-NEXT: flw fs3, 92(a1) -; LP64D-NEXT: flw fs4, 112(a1) -; LP64D-NEXT: flw fs5, 116(a1) -; LP64D-NEXT: flw fs6, 120(a1) -; LP64D-NEXT: flw fs7, 124(a1) -; LP64D-NEXT: flw fs8, 96(a1) -; LP64D-NEXT: flw fs9, 100(a1) -; LP64D-NEXT: flw fs10, 104(a1) -; LP64D-NEXT: flw fs11, 108(a1) -; LP64D-NEXT: fsw fs7, 124(a1) -; LP64D-NEXT: fsw fs6, 120(a1) -; LP64D-NEXT: fsw fs5, 116(a1) -; LP64D-NEXT: fsw fs4, 112(a1) -; LP64D-NEXT: fsw fs11, 108(a1) -; LP64D-NEXT: fsw fs10, 104(a1) -; LP64D-NEXT: fsw fs9, 100(a1) -; LP64D-NEXT: fsw fs8, 96(a1) +; LP64D-NEXT: flw fs4, 96(a1) +; LP64D-NEXT: flw fs5, 100(a1) +; LP64D-NEXT: flw fs6, 104(a1) +; LP64D-NEXT: flw fs7, 108(a1) +; LP64D-NEXT: flw fs8, 112(a1) +; LP64D-NEXT: flw fs9, 116(a1) +; LP64D-NEXT: flw fs10, 120(a1) +; LP64D-NEXT: flw fs11, 124(a1) +; LP64D-NEXT: fsw fs11, 124(a1) +; LP64D-NEXT: fsw fs10, 120(a1) +; LP64D-NEXT: fsw fs9, 116(a1) +; LP64D-NEXT: fsw fs8, 112(a1) +; LP64D-NEXT: fsw fs7, 108(a1) +; LP64D-NEXT: fsw fs6, 104(a1) +; LP64D-NEXT: fsw fs5, 100(a1) +; LP64D-NEXT: fsw fs4, 96(a1) ; LP64D-NEXT: fsw fs3, 92(a1) ; LP64D-NEXT: fsw fs2, 88(a1) ; LP64D-NEXT: fsw fs1, 84(a1) diff --git a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll 
b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll index 0501c700f57df..8a97e77bea55d 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-fpr64s.ll @@ -45,26 +45,26 @@ define void @callee() nounwind { ; ILP32-NEXT: fld ft11, 152(a1) ; ILP32-NEXT: fld fs0, 160(a1) ; ILP32-NEXT: fld fs1, 168(a1) -; ILP32-NEXT: fld fs2, 208(a1) -; ILP32-NEXT: fld fs3, 216(a1) -; ILP32-NEXT: fld fs4, 224(a1) -; ILP32-NEXT: fld fs5, 232(a1) -; ILP32-NEXT: fld fs6, 240(a1) -; ILP32-NEXT: fld fs7, 248(a1) -; ILP32-NEXT: fld fs8, 176(a1) -; ILP32-NEXT: fld fs9, 184(a1) -; ILP32-NEXT: fld fs10, 192(a1) -; ILP32-NEXT: fld fs11, 200(a1) -; ILP32-NEXT: fsd fs7, 248(a1) -; ILP32-NEXT: fsd fs6, 240(a1) -; ILP32-NEXT: fsd fs5, 232(a1) -; ILP32-NEXT: fsd fs4, 224(a1) -; ILP32-NEXT: fsd fs3, 216(a1) -; ILP32-NEXT: fsd fs2, 208(a1) -; ILP32-NEXT: fsd fs11, 200(a1) -; ILP32-NEXT: fsd fs10, 192(a1) -; ILP32-NEXT: fsd fs9, 184(a1) -; ILP32-NEXT: fsd fs8, 176(a1) +; ILP32-NEXT: fld fs2, 176(a1) +; ILP32-NEXT: fld fs3, 184(a1) +; ILP32-NEXT: fld fs4, 192(a1) +; ILP32-NEXT: fld fs5, 200(a1) +; ILP32-NEXT: fld fs6, 208(a1) +; ILP32-NEXT: fld fs7, 216(a1) +; ILP32-NEXT: fld fs8, 224(a1) +; ILP32-NEXT: fld fs9, 232(a1) +; ILP32-NEXT: fld fs10, 240(a1) +; ILP32-NEXT: fld fs11, 248(a1) +; ILP32-NEXT: fsd fs11, 248(a1) +; ILP32-NEXT: fsd fs10, 240(a1) +; ILP32-NEXT: fsd fs9, 232(a1) +; ILP32-NEXT: fsd fs8, 224(a1) +; ILP32-NEXT: fsd fs7, 216(a1) +; ILP32-NEXT: fsd fs6, 208(a1) +; ILP32-NEXT: fsd fs5, 200(a1) +; ILP32-NEXT: fsd fs4, 192(a1) +; ILP32-NEXT: fsd fs3, 184(a1) +; ILP32-NEXT: fsd fs2, 176(a1) ; ILP32-NEXT: fsd fs1, 168(a1) ; ILP32-NEXT: fsd fs0, 160(a1) ; ILP32-NEXT: fsd ft11, 152(a1) @@ -115,26 +115,26 @@ define void @callee() nounwind { ; LP64-NEXT: fld ft11, 152(a1) ; LP64-NEXT: fld fs0, 160(a1) ; LP64-NEXT: fld fs1, 168(a1) -; LP64-NEXT: fld fs2, 208(a1) -; LP64-NEXT: fld fs3, 216(a1) -; LP64-NEXT: fld fs4, 224(a1) -; LP64-NEXT: fld fs5, 232(a1) -; LP64-NEXT: fld fs6, 240(a1) -; LP64-NEXT: fld fs7, 248(a1) -; LP64-NEXT: fld fs8, 176(a1) -; LP64-NEXT: fld fs9, 184(a1) -; LP64-NEXT: fld fs10, 192(a1) -; LP64-NEXT: fld fs11, 200(a1) -; LP64-NEXT: fsd fs7, 248(a1) -; LP64-NEXT: fsd fs6, 240(a1) -; LP64-NEXT: fsd fs5, 232(a1) -; LP64-NEXT: fsd fs4, 224(a1) -; LP64-NEXT: fsd fs3, 216(a1) -; LP64-NEXT: fsd fs2, 208(a1) -; LP64-NEXT: fsd fs11, 200(a1) -; LP64-NEXT: fsd fs10, 192(a1) -; LP64-NEXT: fsd fs9, 184(a1) -; LP64-NEXT: fsd fs8, 176(a1) +; LP64-NEXT: fld fs2, 176(a1) +; LP64-NEXT: fld fs3, 184(a1) +; LP64-NEXT: fld fs4, 192(a1) +; LP64-NEXT: fld fs5, 200(a1) +; LP64-NEXT: fld fs6, 208(a1) +; LP64-NEXT: fld fs7, 216(a1) +; LP64-NEXT: fld fs8, 224(a1) +; LP64-NEXT: fld fs9, 232(a1) +; LP64-NEXT: fld fs10, 240(a1) +; LP64-NEXT: fld fs11, 248(a1) +; LP64-NEXT: fsd fs11, 248(a1) +; LP64-NEXT: fsd fs10, 240(a1) +; LP64-NEXT: fsd fs9, 232(a1) +; LP64-NEXT: fsd fs8, 224(a1) +; LP64-NEXT: fsd fs7, 216(a1) +; LP64-NEXT: fsd fs6, 208(a1) +; LP64-NEXT: fsd fs5, 200(a1) +; LP64-NEXT: fsd fs4, 192(a1) +; LP64-NEXT: fsd fs3, 184(a1) +; LP64-NEXT: fsd fs2, 176(a1) ; LP64-NEXT: fsd fs1, 168(a1) ; LP64-NEXT: fsd fs0, 160(a1) ; LP64-NEXT: fsd ft11, 152(a1) @@ -185,26 +185,26 @@ define void @callee() nounwind { ; LP64E-NEXT: fld ft11, 152(a1) ; LP64E-NEXT: fld fs0, 160(a1) ; LP64E-NEXT: fld fs1, 168(a1) -; LP64E-NEXT: fld fs2, 208(a1) -; LP64E-NEXT: fld fs3, 216(a1) -; LP64E-NEXT: fld fs4, 224(a1) -; LP64E-NEXT: fld fs5, 232(a1) -; LP64E-NEXT: fld fs6, 240(a1) -; LP64E-NEXT: fld fs7, 248(a1) -; 
LP64E-NEXT: fld fs8, 176(a1) -; LP64E-NEXT: fld fs9, 184(a1) -; LP64E-NEXT: fld fs10, 192(a1) -; LP64E-NEXT: fld fs11, 200(a1) -; LP64E-NEXT: fsd fs7, 248(a1) -; LP64E-NEXT: fsd fs6, 240(a1) -; LP64E-NEXT: fsd fs5, 232(a1) -; LP64E-NEXT: fsd fs4, 224(a1) -; LP64E-NEXT: fsd fs3, 216(a1) -; LP64E-NEXT: fsd fs2, 208(a1) -; LP64E-NEXT: fsd fs11, 200(a1) -; LP64E-NEXT: fsd fs10, 192(a1) -; LP64E-NEXT: fsd fs9, 184(a1) -; LP64E-NEXT: fsd fs8, 176(a1) +; LP64E-NEXT: fld fs2, 176(a1) +; LP64E-NEXT: fld fs3, 184(a1) +; LP64E-NEXT: fld fs4, 192(a1) +; LP64E-NEXT: fld fs5, 200(a1) +; LP64E-NEXT: fld fs6, 208(a1) +; LP64E-NEXT: fld fs7, 216(a1) +; LP64E-NEXT: fld fs8, 224(a1) +; LP64E-NEXT: fld fs9, 232(a1) +; LP64E-NEXT: fld fs10, 240(a1) +; LP64E-NEXT: fld fs11, 248(a1) +; LP64E-NEXT: fsd fs11, 248(a1) +; LP64E-NEXT: fsd fs10, 240(a1) +; LP64E-NEXT: fsd fs9, 232(a1) +; LP64E-NEXT: fsd fs8, 224(a1) +; LP64E-NEXT: fsd fs7, 216(a1) +; LP64E-NEXT: fsd fs6, 208(a1) +; LP64E-NEXT: fsd fs5, 200(a1) +; LP64E-NEXT: fsd fs4, 192(a1) +; LP64E-NEXT: fsd fs3, 184(a1) +; LP64E-NEXT: fsd fs2, 176(a1) ; LP64E-NEXT: fsd fs1, 168(a1) ; LP64E-NEXT: fsd fs0, 160(a1) ; LP64E-NEXT: fsd ft11, 152(a1) @@ -268,26 +268,26 @@ define void @callee() nounwind { ; ILP32D-NEXT: fld ft11, 152(a1) ; ILP32D-NEXT: fld fs0, 160(a1) ; ILP32D-NEXT: fld fs1, 168(a1) -; ILP32D-NEXT: fld fs2, 208(a1) -; ILP32D-NEXT: fld fs3, 216(a1) -; ILP32D-NEXT: fld fs4, 224(a1) -; ILP32D-NEXT: fld fs5, 232(a1) -; ILP32D-NEXT: fld fs6, 240(a1) -; ILP32D-NEXT: fld fs7, 248(a1) -; ILP32D-NEXT: fld fs8, 176(a1) -; ILP32D-NEXT: fld fs9, 184(a1) -; ILP32D-NEXT: fld fs10, 192(a1) -; ILP32D-NEXT: fld fs11, 200(a1) -; ILP32D-NEXT: fsd fs7, 248(a1) -; ILP32D-NEXT: fsd fs6, 240(a1) -; ILP32D-NEXT: fsd fs5, 232(a1) -; ILP32D-NEXT: fsd fs4, 224(a1) -; ILP32D-NEXT: fsd fs3, 216(a1) -; ILP32D-NEXT: fsd fs2, 208(a1) -; ILP32D-NEXT: fsd fs11, 200(a1) -; ILP32D-NEXT: fsd fs10, 192(a1) -; ILP32D-NEXT: fsd fs9, 184(a1) -; ILP32D-NEXT: fsd fs8, 176(a1) +; ILP32D-NEXT: fld fs2, 176(a1) +; ILP32D-NEXT: fld fs3, 184(a1) +; ILP32D-NEXT: fld fs4, 192(a1) +; ILP32D-NEXT: fld fs5, 200(a1) +; ILP32D-NEXT: fld fs6, 208(a1) +; ILP32D-NEXT: fld fs7, 216(a1) +; ILP32D-NEXT: fld fs8, 224(a1) +; ILP32D-NEXT: fld fs9, 232(a1) +; ILP32D-NEXT: fld fs10, 240(a1) +; ILP32D-NEXT: fld fs11, 248(a1) +; ILP32D-NEXT: fsd fs11, 248(a1) +; ILP32D-NEXT: fsd fs10, 240(a1) +; ILP32D-NEXT: fsd fs9, 232(a1) +; ILP32D-NEXT: fsd fs8, 224(a1) +; ILP32D-NEXT: fsd fs7, 216(a1) +; ILP32D-NEXT: fsd fs6, 208(a1) +; ILP32D-NEXT: fsd fs5, 200(a1) +; ILP32D-NEXT: fsd fs4, 192(a1) +; ILP32D-NEXT: fsd fs3, 184(a1) +; ILP32D-NEXT: fsd fs2, 176(a1) ; ILP32D-NEXT: fsd fs1, 168(a1) ; ILP32D-NEXT: fsd fs0, 160(a1) ; ILP32D-NEXT: fsd ft11, 152(a1) @@ -364,26 +364,26 @@ define void @callee() nounwind { ; LP64D-NEXT: fld ft11, 152(a1) ; LP64D-NEXT: fld fs0, 160(a1) ; LP64D-NEXT: fld fs1, 168(a1) -; LP64D-NEXT: fld fs2, 208(a1) -; LP64D-NEXT: fld fs3, 216(a1) -; LP64D-NEXT: fld fs4, 224(a1) -; LP64D-NEXT: fld fs5, 232(a1) -; LP64D-NEXT: fld fs6, 240(a1) -; LP64D-NEXT: fld fs7, 248(a1) -; LP64D-NEXT: fld fs8, 176(a1) -; LP64D-NEXT: fld fs9, 184(a1) -; LP64D-NEXT: fld fs10, 192(a1) -; LP64D-NEXT: fld fs11, 200(a1) -; LP64D-NEXT: fsd fs7, 248(a1) -; LP64D-NEXT: fsd fs6, 240(a1) -; LP64D-NEXT: fsd fs5, 232(a1) -; LP64D-NEXT: fsd fs4, 224(a1) -; LP64D-NEXT: fsd fs3, 216(a1) -; LP64D-NEXT: fsd fs2, 208(a1) -; LP64D-NEXT: fsd fs11, 200(a1) -; LP64D-NEXT: fsd fs10, 192(a1) -; LP64D-NEXT: fsd fs9, 184(a1) -; LP64D-NEXT: fsd fs8, 
176(a1) +; LP64D-NEXT: fld fs2, 176(a1) +; LP64D-NEXT: fld fs3, 184(a1) +; LP64D-NEXT: fld fs4, 192(a1) +; LP64D-NEXT: fld fs5, 200(a1) +; LP64D-NEXT: fld fs6, 208(a1) +; LP64D-NEXT: fld fs7, 216(a1) +; LP64D-NEXT: fld fs8, 224(a1) +; LP64D-NEXT: fld fs9, 232(a1) +; LP64D-NEXT: fld fs10, 240(a1) +; LP64D-NEXT: fld fs11, 248(a1) +; LP64D-NEXT: fsd fs11, 248(a1) +; LP64D-NEXT: fsd fs10, 240(a1) +; LP64D-NEXT: fsd fs9, 232(a1) +; LP64D-NEXT: fsd fs8, 224(a1) +; LP64D-NEXT: fsd fs7, 216(a1) +; LP64D-NEXT: fsd fs6, 208(a1) +; LP64D-NEXT: fsd fs5, 200(a1) +; LP64D-NEXT: fsd fs4, 192(a1) +; LP64D-NEXT: fsd fs3, 184(a1) +; LP64D-NEXT: fsd fs2, 176(a1) ; LP64D-NEXT: fsd fs1, 168(a1) ; LP64D-NEXT: fsd fs0, 160(a1) ; LP64D-NEXT: fsd ft11, 152(a1) diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index f9f1ba60a8ac0..53a4b1bafaab6 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -68,16 +68,16 @@ define void @callee() { ; RV32I-NEXT: .cfi_offset s9, -44 ; RV32I-NEXT: .cfi_offset s10, -48 ; RV32I-NEXT: .cfi_offset s11, -52 -; RV32I-NEXT: lui a7, %hi(var) -; RV32I-NEXT: lw a0, %lo(var)(a7) +; RV32I-NEXT: lui a4, %hi(var) +; RV32I-NEXT: lw a0, %lo(var)(a4) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+4)(a7) +; RV32I-NEXT: lw a0, %lo(var+4)(a4) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+8)(a7) +; RV32I-NEXT: lw a0, %lo(var+8)(a4) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+12)(a7) +; RV32I-NEXT: lw a0, %lo(var+12)(a4) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var) +; RV32I-NEXT: addi a5, a4, %lo(var) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -100,22 +100,22 @@ define void @callee() { ; RV32I-NEXT: lw s8, 84(a5) ; RV32I-NEXT: lw s9, 88(a5) ; RV32I-NEXT: lw s10, 92(a5) -; RV32I-NEXT: lw s11, 112(a5) -; RV32I-NEXT: lw ra, 116(a5) -; RV32I-NEXT: lw a3, 120(a5) -; RV32I-NEXT: lw a0, 124(a5) -; RV32I-NEXT: lw a6, 96(a5) -; RV32I-NEXT: lw a4, 100(a5) -; RV32I-NEXT: lw a2, 104(a5) -; RV32I-NEXT: lw a1, 108(a5) -; RV32I-NEXT: sw a0, 124(a5) -; RV32I-NEXT: sw a3, 120(a5) -; RV32I-NEXT: sw ra, 116(a5) -; RV32I-NEXT: sw s11, 112(a5) -; RV32I-NEXT: sw a1, 108(a5) -; RV32I-NEXT: sw a2, 104(a5) -; RV32I-NEXT: sw a4, 100(a5) -; RV32I-NEXT: sw a6, 96(a5) +; RV32I-NEXT: lw s11, 96(a5) +; RV32I-NEXT: lw ra, 100(a5) +; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a3, 108(a5) +; RV32I-NEXT: lw a2, 112(a5) +; RV32I-NEXT: lw a1, 116(a5) +; RV32I-NEXT: lw a0, 120(a5) +; RV32I-NEXT: lw a7, 124(a5) +; RV32I-NEXT: sw a7, 124(a5) +; RV32I-NEXT: sw a0, 120(a5) +; RV32I-NEXT: sw a1, 116(a5) +; RV32I-NEXT: sw a2, 112(a5) +; RV32I-NEXT: sw a3, 108(a5) +; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw ra, 100(a5) +; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) ; RV32I-NEXT: sw s9, 88(a5) ; RV32I-NEXT: sw s8, 84(a5) @@ -139,13 +139,13 @@ define void @callee() { ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+12)(a7) +; RV32I-NEXT: sw a0, %lo(var+12)(a4) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+8)(a7) +; RV32I-NEXT: sw a0, %lo(var+8)(a4) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+4)(a7) +; RV32I-NEXT: sw a0, 
%lo(var+4)(a4) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var)(a7) +; RV32I-NEXT: sw a0, %lo(var)(a4) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -186,16 +186,16 @@ define void @callee() { ; RV32I-ILP32E-NEXT: .cfi_offset ra, -4 ; RV32I-ILP32E-NEXT: .cfi_offset s0, -8 ; RV32I-ILP32E-NEXT: .cfi_offset s1, -12 -; RV32I-ILP32E-NEXT: lui a7, %hi(var) -; RV32I-ILP32E-NEXT: lw a0, %lo(var)(a7) +; RV32I-ILP32E-NEXT: lui a4, %hi(var) +; RV32I-ILP32E-NEXT: lw a0, %lo(var)(a4) ; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: lw a0, %lo(var+4)(a7) +; RV32I-ILP32E-NEXT: lw a0, %lo(var+4)(a4) ; RV32I-ILP32E-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: lw a0, %lo(var+8)(a7) +; RV32I-ILP32E-NEXT: lw a0, %lo(var+8)(a4) ; RV32I-ILP32E-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: lw a0, %lo(var+12)(a7) +; RV32I-ILP32E-NEXT: lw a0, %lo(var+12)(a4) ; RV32I-ILP32E-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-ILP32E-NEXT: addi a5, a7, %lo(var) +; RV32I-ILP32E-NEXT: addi a5, a4, %lo(var) ; RV32I-ILP32E-NEXT: lw a0, 16(a5) ; RV32I-ILP32E-NEXT: sw a0, 4(sp) # 4-byte Folded Spill ; RV32I-ILP32E-NEXT: lw a0, 20(a5) @@ -218,22 +218,22 @@ define void @callee() { ; RV32I-ILP32E-NEXT: lw s10, 84(a5) ; RV32I-ILP32E-NEXT: lw s11, 88(a5) ; RV32I-ILP32E-NEXT: lw s0, 92(a5) -; RV32I-ILP32E-NEXT: lw s1, 112(a5) -; RV32I-ILP32E-NEXT: lw ra, 116(a5) -; RV32I-ILP32E-NEXT: lw a3, 120(a5) -; RV32I-ILP32E-NEXT: lw a0, 124(a5) -; RV32I-ILP32E-NEXT: lw a6, 96(a5) -; RV32I-ILP32E-NEXT: lw a4, 100(a5) -; RV32I-ILP32E-NEXT: lw a2, 104(a5) -; RV32I-ILP32E-NEXT: lw a1, 108(a5) -; RV32I-ILP32E-NEXT: sw a0, 124(a5) -; RV32I-ILP32E-NEXT: sw a3, 120(a5) -; RV32I-ILP32E-NEXT: sw ra, 116(a5) -; RV32I-ILP32E-NEXT: sw s1, 112(a5) -; RV32I-ILP32E-NEXT: sw a1, 108(a5) -; RV32I-ILP32E-NEXT: sw a2, 104(a5) -; RV32I-ILP32E-NEXT: sw a4, 100(a5) -; RV32I-ILP32E-NEXT: sw a6, 96(a5) +; RV32I-ILP32E-NEXT: lw s1, 96(a5) +; RV32I-ILP32E-NEXT: lw ra, 100(a5) +; RV32I-ILP32E-NEXT: lw a6, 104(a5) +; RV32I-ILP32E-NEXT: lw a3, 108(a5) +; RV32I-ILP32E-NEXT: lw a2, 112(a5) +; RV32I-ILP32E-NEXT: lw a1, 116(a5) +; RV32I-ILP32E-NEXT: lw a0, 120(a5) +; RV32I-ILP32E-NEXT: lw a7, 124(a5) +; RV32I-ILP32E-NEXT: sw a7, 124(a5) +; RV32I-ILP32E-NEXT: sw a0, 120(a5) +; RV32I-ILP32E-NEXT: sw a1, 116(a5) +; RV32I-ILP32E-NEXT: sw a2, 112(a5) +; RV32I-ILP32E-NEXT: sw a3, 108(a5) +; RV32I-ILP32E-NEXT: sw a6, 104(a5) +; RV32I-ILP32E-NEXT: sw ra, 100(a5) +; RV32I-ILP32E-NEXT: sw s1, 96(a5) ; RV32I-ILP32E-NEXT: sw s0, 92(a5) ; RV32I-ILP32E-NEXT: sw s11, 88(a5) ; RV32I-ILP32E-NEXT: sw s10, 84(a5) @@ -257,13 +257,13 @@ define void @callee() { ; RV32I-ILP32E-NEXT: lw a0, 4(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: sw a0, 16(a5) ; RV32I-ILP32E-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: sw a0, %lo(var+12)(a7) +; RV32I-ILP32E-NEXT: sw a0, %lo(var+12)(a4) ; RV32I-ILP32E-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: sw a0, %lo(var+8)(a7) +; RV32I-ILP32E-NEXT: sw a0, %lo(var+8)(a4) ; RV32I-ILP32E-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: sw a0, %lo(var+4)(a7) +; RV32I-ILP32E-NEXT: sw a0, %lo(var+4)(a4) ; RV32I-ILP32E-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-ILP32E-NEXT: sw a0, %lo(var)(a7) +; RV32I-ILP32E-NEXT: sw a0, %lo(var)(a4) ; RV32I-ILP32E-NEXT: lw ra, 32(sp) # 4-byte Folded Reload ; 
RV32I-ILP32E-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RV32I-ILP32E-NEXT: lw s1, 24(sp) # 4-byte Folded Reload @@ -306,16 +306,16 @@ define void @callee() { ; RV32I-WITH-FP-NEXT: .cfi_offset s11, -52 ; RV32I-WITH-FP-NEXT: addi s0, sp, 80 ; RV32I-WITH-FP-NEXT: .cfi_def_cfa s0, 0 -; RV32I-WITH-FP-NEXT: lui t0, %hi(var) -; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(t0) +; RV32I-WITH-FP-NEXT: lui a4, %hi(var) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a4) ; RV32I-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(t0) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a4) ; RV32I-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(t0) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a4) ; RV32I-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(t0) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a4) ; RV32I-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: addi a5, t0, %lo(var) +; RV32I-WITH-FP-NEXT: addi a5, a4, %lo(var) ; RV32I-WITH-FP-NEXT: lw a0, 16(a5) ; RV32I-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: lw a0, 20(a5) @@ -339,22 +339,22 @@ define void @callee() { ; RV32I-WITH-FP-NEXT: lw s9, 84(a5) ; RV32I-WITH-FP-NEXT: lw s10, 88(a5) ; RV32I-WITH-FP-NEXT: lw s11, 92(a5) -; RV32I-WITH-FP-NEXT: lw ra, 112(a5) -; RV32I-WITH-FP-NEXT: lw a4, 116(a5) -; RV32I-WITH-FP-NEXT: lw a3, 120(a5) -; RV32I-WITH-FP-NEXT: lw a0, 124(a5) -; RV32I-WITH-FP-NEXT: lw a7, 96(a5) -; RV32I-WITH-FP-NEXT: lw a6, 100(a5) -; RV32I-WITH-FP-NEXT: lw a2, 104(a5) -; RV32I-WITH-FP-NEXT: lw a1, 108(a5) -; RV32I-WITH-FP-NEXT: sw a0, 124(a5) -; RV32I-WITH-FP-NEXT: sw a3, 120(a5) -; RV32I-WITH-FP-NEXT: sw a4, 116(a5) -; RV32I-WITH-FP-NEXT: sw ra, 112(a5) -; RV32I-WITH-FP-NEXT: sw a1, 108(a5) -; RV32I-WITH-FP-NEXT: sw a2, 104(a5) -; RV32I-WITH-FP-NEXT: sw a6, 100(a5) -; RV32I-WITH-FP-NEXT: sw a7, 96(a5) +; RV32I-WITH-FP-NEXT: lw ra, 96(a5) +; RV32I-WITH-FP-NEXT: lw a7, 100(a5) +; RV32I-WITH-FP-NEXT: lw a6, 104(a5) +; RV32I-WITH-FP-NEXT: lw a3, 108(a5) +; RV32I-WITH-FP-NEXT: lw a2, 112(a5) +; RV32I-WITH-FP-NEXT: lw a1, 116(a5) +; RV32I-WITH-FP-NEXT: lw a0, 120(a5) +; RV32I-WITH-FP-NEXT: lw t0, 124(a5) +; RV32I-WITH-FP-NEXT: sw t0, 124(a5) +; RV32I-WITH-FP-NEXT: sw a0, 120(a5) +; RV32I-WITH-FP-NEXT: sw a1, 116(a5) +; RV32I-WITH-FP-NEXT: sw a2, 112(a5) +; RV32I-WITH-FP-NEXT: sw a3, 108(a5) +; RV32I-WITH-FP-NEXT: sw a6, 104(a5) +; RV32I-WITH-FP-NEXT: sw a7, 100(a5) +; RV32I-WITH-FP-NEXT: sw ra, 96(a5) ; RV32I-WITH-FP-NEXT: sw s11, 92(a5) ; RV32I-WITH-FP-NEXT: sw s10, 88(a5) ; RV32I-WITH-FP-NEXT: sw s9, 84(a5) @@ -379,13 +379,13 @@ define void @callee() { ; RV32I-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: sw a0, 16(a5) ; RV32I-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(t0) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a4) ; RV32I-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(t0) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a4) ; RV32I-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(t0) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a4) ; RV32I-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(t0) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a4) ; RV32I-WITH-FP-NEXT: .cfi_def_cfa sp, 80 ; RV32I-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload 
@@ -434,16 +434,16 @@ define void @callee() { ; RV32IZCMP-NEXT: .cfi_offset s9, -12 ; RV32IZCMP-NEXT: .cfi_offset s10, -8 ; RV32IZCMP-NEXT: .cfi_offset s11, -4 -; RV32IZCMP-NEXT: lui t0, %hi(var) -; RV32IZCMP-NEXT: lw a0, %lo(var)(t0) +; RV32IZCMP-NEXT: lui a4, %hi(var) +; RV32IZCMP-NEXT: lw a0, %lo(var)(a4) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+4)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a4) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+8)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a4) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+12)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a4) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, t0, %lo(var) +; RV32IZCMP-NEXT: addi a5, a4, %lo(var) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -463,28 +463,28 @@ define void @callee() { ; RV32IZCMP-NEXT: lw s11, 72(a5) ; RV32IZCMP-NEXT: lw ra, 76(a5) ; RV32IZCMP-NEXT: lw s1, 80(a5) -; RV32IZCMP-NEXT: lw t3, 84(a5) -; RV32IZCMP-NEXT: lw t2, 88(a5) -; RV32IZCMP-NEXT: lw t1, 92(a5) -; RV32IZCMP-NEXT: lw a7, 112(a5) -; RV32IZCMP-NEXT: lw s0, 116(a5) -; RV32IZCMP-NEXT: lw a3, 120(a5) -; RV32IZCMP-NEXT: lw a0, 124(a5) -; RV32IZCMP-NEXT: lw a6, 96(a5) -; RV32IZCMP-NEXT: lw a4, 100(a5) -; RV32IZCMP-NEXT: lw a2, 104(a5) -; RV32IZCMP-NEXT: lw a1, 108(a5) -; RV32IZCMP-NEXT: sw a0, 124(a5) -; RV32IZCMP-NEXT: sw a3, 120(a5) -; RV32IZCMP-NEXT: sw s0, 116(a5) -; RV32IZCMP-NEXT: sw a7, 112(a5) -; RV32IZCMP-NEXT: sw a1, 108(a5) -; RV32IZCMP-NEXT: sw a2, 104(a5) -; RV32IZCMP-NEXT: sw a4, 100(a5) -; RV32IZCMP-NEXT: sw a6, 96(a5) -; RV32IZCMP-NEXT: sw t1, 92(a5) -; RV32IZCMP-NEXT: sw t2, 88(a5) -; RV32IZCMP-NEXT: sw t3, 84(a5) +; RV32IZCMP-NEXT: lw t2, 84(a5) +; RV32IZCMP-NEXT: lw t1, 88(a5) +; RV32IZCMP-NEXT: lw t0, 92(a5) +; RV32IZCMP-NEXT: lw a7, 96(a5) +; RV32IZCMP-NEXT: lw s0, 100(a5) +; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a3, 108(a5) +; RV32IZCMP-NEXT: lw a2, 112(a5) +; RV32IZCMP-NEXT: lw a1, 116(a5) +; RV32IZCMP-NEXT: lw a0, 120(a5) +; RV32IZCMP-NEXT: lw t3, 124(a5) +; RV32IZCMP-NEXT: sw t3, 124(a5) +; RV32IZCMP-NEXT: sw a0, 120(a5) +; RV32IZCMP-NEXT: sw a1, 116(a5) +; RV32IZCMP-NEXT: sw a2, 112(a5) +; RV32IZCMP-NEXT: sw a3, 108(a5) +; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw s0, 100(a5) +; RV32IZCMP-NEXT: sw a7, 96(a5) +; RV32IZCMP-NEXT: sw t0, 92(a5) +; RV32IZCMP-NEXT: sw t1, 88(a5) +; RV32IZCMP-NEXT: sw t2, 84(a5) ; RV32IZCMP-NEXT: sw s1, 80(a5) ; RV32IZCMP-NEXT: sw ra, 76(a5) ; RV32IZCMP-NEXT: sw s11, 72(a5) @@ -505,13 +505,13 @@ define void @callee() { ; RV32IZCMP-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+12)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a4) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+8)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a4) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+4)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a4) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var)(a4) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 80 ; ; RV32IZCMP-WITH-FP-LABEL: callee: @@ -546,16 +546,16 @@ define void @callee() { ; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s11, -52 ; 
RV32IZCMP-WITH-FP-NEXT: addi s0, sp, 80 ; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa s0, 0 -; RV32IZCMP-WITH-FP-NEXT: lui t1, %hi(var) -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(t1) +; RV32IZCMP-WITH-FP-NEXT: lui a4, %hi(var) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a4) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(t1) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a4) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(t1) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a4) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(t1) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a4) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: addi a5, t1, %lo(var) +; RV32IZCMP-WITH-FP-NEXT: addi a5, a4, %lo(var) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -575,30 +575,30 @@ define void @callee() { ; RV32IZCMP-WITH-FP-NEXT: lw s10, 68(a5) ; RV32IZCMP-WITH-FP-NEXT: lw s11, 72(a5) ; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t4, 80(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t3, 84(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t2, 88(a5) +; RV32IZCMP-WITH-FP-NEXT: lw t3, 80(a5) +; RV32IZCMP-WITH-FP-NEXT: lw t2, 84(a5) +; RV32IZCMP-WITH-FP-NEXT: lw t1, 88(a5) ; RV32IZCMP-WITH-FP-NEXT: lw s1, 92(a5) -; RV32IZCMP-WITH-FP-NEXT: lw t0, 112(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a4, 116(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a3, 120(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a0, 124(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a7, 96(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a6, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a2, 104(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a1, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a0, 124(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a3, 120(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a4, 116(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t0, 112(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a1, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a2, 104(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a6, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a7, 96(a5) +; RV32IZCMP-WITH-FP-NEXT: lw t0, 96(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a7, 100(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a6, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a3, 108(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a2, 112(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a1, 116(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a0, 120(a5) +; RV32IZCMP-WITH-FP-NEXT: lw t4, 124(a5) +; RV32IZCMP-WITH-FP-NEXT: sw t4, 124(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a0, 120(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a1, 116(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a2, 112(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a3, 108(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a6, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a7, 100(a5) +; RV32IZCMP-WITH-FP-NEXT: sw t0, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: sw s1, 92(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t2, 88(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t3, 84(a5) -; RV32IZCMP-WITH-FP-NEXT: sw t4, 80(a5) +; RV32IZCMP-WITH-FP-NEXT: sw t1, 88(a5) +; RV32IZCMP-WITH-FP-NEXT: sw t2, 84(a5) +; RV32IZCMP-WITH-FP-NEXT: sw t3, 80(a5) ; RV32IZCMP-WITH-FP-NEXT: sw ra, 76(a5) ; RV32IZCMP-WITH-FP-NEXT: sw s11, 72(a5) ; RV32IZCMP-WITH-FP-NEXT: sw s10, 68(a5) @@ -619,13 +619,13 @@ define void @callee() { ; RV32IZCMP-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(t1) +; RV32IZCMP-WITH-FP-NEXT: sw 
a0, %lo(var+12)(a4) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(t1) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a4) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(t1) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a4) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(t1) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a4) ; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa sp, 80 ; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload @@ -687,16 +687,16 @@ define void @callee() { ; RV64I-NEXT: .cfi_offset s9, -88 ; RV64I-NEXT: .cfi_offset s10, -96 ; RV64I-NEXT: .cfi_offset s11, -104 -; RV64I-NEXT: lui a7, %hi(var) -; RV64I-NEXT: lw a0, %lo(var)(a7) +; RV64I-NEXT: lui a4, %hi(var) +; RV64I-NEXT: lw a0, %lo(var)(a4) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+4)(a7) +; RV64I-NEXT: lw a0, %lo(var+4)(a4) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+8)(a7) +; RV64I-NEXT: lw a0, %lo(var+8)(a4) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+12)(a7) +; RV64I-NEXT: lw a0, %lo(var+12)(a4) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var) +; RV64I-NEXT: addi a5, a4, %lo(var) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -719,22 +719,22 @@ define void @callee() { ; RV64I-NEXT: lw s8, 84(a5) ; RV64I-NEXT: lw s9, 88(a5) ; RV64I-NEXT: lw s10, 92(a5) -; RV64I-NEXT: lw s11, 112(a5) -; RV64I-NEXT: lw ra, 116(a5) -; RV64I-NEXT: lw a3, 120(a5) -; RV64I-NEXT: lw a0, 124(a5) -; RV64I-NEXT: lw a6, 96(a5) -; RV64I-NEXT: lw a4, 100(a5) -; RV64I-NEXT: lw a2, 104(a5) -; RV64I-NEXT: lw a1, 108(a5) -; RV64I-NEXT: sw a0, 124(a5) -; RV64I-NEXT: sw a3, 120(a5) -; RV64I-NEXT: sw ra, 116(a5) -; RV64I-NEXT: sw s11, 112(a5) -; RV64I-NEXT: sw a1, 108(a5) -; RV64I-NEXT: sw a2, 104(a5) -; RV64I-NEXT: sw a4, 100(a5) -; RV64I-NEXT: sw a6, 96(a5) +; RV64I-NEXT: lw s11, 96(a5) +; RV64I-NEXT: lw ra, 100(a5) +; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a3, 108(a5) +; RV64I-NEXT: lw a2, 112(a5) +; RV64I-NEXT: lw a1, 116(a5) +; RV64I-NEXT: lw a0, 120(a5) +; RV64I-NEXT: lw a7, 124(a5) +; RV64I-NEXT: sw a7, 124(a5) +; RV64I-NEXT: sw a0, 120(a5) +; RV64I-NEXT: sw a1, 116(a5) +; RV64I-NEXT: sw a2, 112(a5) +; RV64I-NEXT: sw a3, 108(a5) +; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw ra, 100(a5) +; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) ; RV64I-NEXT: sw s9, 88(a5) ; RV64I-NEXT: sw s8, 84(a5) @@ -758,13 +758,13 @@ define void @callee() { ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+12)(a7) +; RV64I-NEXT: sw a0, %lo(var+12)(a4) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+8)(a7) +; RV64I-NEXT: sw a0, %lo(var+8)(a4) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+4)(a7) +; RV64I-NEXT: sw a0, %lo(var+4)(a4) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var)(a7) +; RV64I-NEXT: sw a0, %lo(var)(a4) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -805,16 +805,16 @@ define void 
@callee() { ; RV64I-LP64E-NEXT: .cfi_offset ra, -8 ; RV64I-LP64E-NEXT: .cfi_offset s0, -16 ; RV64I-LP64E-NEXT: .cfi_offset s1, -24 -; RV64I-LP64E-NEXT: lui a7, %hi(var) -; RV64I-LP64E-NEXT: lw a0, %lo(var)(a7) +; RV64I-LP64E-NEXT: lui a4, %hi(var) +; RV64I-LP64E-NEXT: lw a0, %lo(var)(a4) ; RV64I-LP64E-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a7) +; RV64I-LP64E-NEXT: lw a0, %lo(var+4)(a4) ; RV64I-LP64E-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a7) +; RV64I-LP64E-NEXT: lw a0, %lo(var+8)(a4) ; RV64I-LP64E-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a7) +; RV64I-LP64E-NEXT: lw a0, %lo(var+12)(a4) ; RV64I-LP64E-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-LP64E-NEXT: addi a5, a7, %lo(var) +; RV64I-LP64E-NEXT: addi a5, a4, %lo(var) ; RV64I-LP64E-NEXT: lw a0, 16(a5) ; RV64I-LP64E-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64I-LP64E-NEXT: lw a0, 20(a5) @@ -837,22 +837,22 @@ define void @callee() { ; RV64I-LP64E-NEXT: lw s10, 84(a5) ; RV64I-LP64E-NEXT: lw s11, 88(a5) ; RV64I-LP64E-NEXT: lw s0, 92(a5) -; RV64I-LP64E-NEXT: lw s1, 112(a5) -; RV64I-LP64E-NEXT: lw ra, 116(a5) -; RV64I-LP64E-NEXT: lw a3, 120(a5) -; RV64I-LP64E-NEXT: lw a0, 124(a5) -; RV64I-LP64E-NEXT: lw a6, 96(a5) -; RV64I-LP64E-NEXT: lw a4, 100(a5) -; RV64I-LP64E-NEXT: lw a2, 104(a5) -; RV64I-LP64E-NEXT: lw a1, 108(a5) -; RV64I-LP64E-NEXT: sw a0, 124(a5) -; RV64I-LP64E-NEXT: sw a3, 120(a5) -; RV64I-LP64E-NEXT: sw ra, 116(a5) -; RV64I-LP64E-NEXT: sw s1, 112(a5) -; RV64I-LP64E-NEXT: sw a1, 108(a5) -; RV64I-LP64E-NEXT: sw a2, 104(a5) -; RV64I-LP64E-NEXT: sw a4, 100(a5) -; RV64I-LP64E-NEXT: sw a6, 96(a5) +; RV64I-LP64E-NEXT: lw s1, 96(a5) +; RV64I-LP64E-NEXT: lw ra, 100(a5) +; RV64I-LP64E-NEXT: lw a6, 104(a5) +; RV64I-LP64E-NEXT: lw a3, 108(a5) +; RV64I-LP64E-NEXT: lw a2, 112(a5) +; RV64I-LP64E-NEXT: lw a1, 116(a5) +; RV64I-LP64E-NEXT: lw a0, 120(a5) +; RV64I-LP64E-NEXT: lw a7, 124(a5) +; RV64I-LP64E-NEXT: sw a7, 124(a5) +; RV64I-LP64E-NEXT: sw a0, 120(a5) +; RV64I-LP64E-NEXT: sw a1, 116(a5) +; RV64I-LP64E-NEXT: sw a2, 112(a5) +; RV64I-LP64E-NEXT: sw a3, 108(a5) +; RV64I-LP64E-NEXT: sw a6, 104(a5) +; RV64I-LP64E-NEXT: sw ra, 100(a5) +; RV64I-LP64E-NEXT: sw s1, 96(a5) ; RV64I-LP64E-NEXT: sw s0, 92(a5) ; RV64I-LP64E-NEXT: sw s11, 88(a5) ; RV64I-LP64E-NEXT: sw s10, 84(a5) @@ -876,13 +876,13 @@ define void @callee() { ; RV64I-LP64E-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: sw a0, 16(a5) ; RV64I-LP64E-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: sw a0, %lo(var+12)(a7) +; RV64I-LP64E-NEXT: sw a0, %lo(var+12)(a4) ; RV64I-LP64E-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: sw a0, %lo(var+8)(a7) +; RV64I-LP64E-NEXT: sw a0, %lo(var+8)(a4) ; RV64I-LP64E-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: sw a0, %lo(var+4)(a7) +; RV64I-LP64E-NEXT: sw a0, %lo(var+4)(a4) ; RV64I-LP64E-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-LP64E-NEXT: sw a0, %lo(var)(a7) +; RV64I-LP64E-NEXT: sw a0, %lo(var)(a4) ; RV64I-LP64E-NEXT: ld ra, 64(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: ld s0, 56(sp) # 8-byte Folded Reload ; RV64I-LP64E-NEXT: ld s1, 48(sp) # 8-byte Folded Reload @@ -925,16 +925,16 @@ define void @callee() { ; RV64I-WITH-FP-NEXT: .cfi_offset s11, -104 ; RV64I-WITH-FP-NEXT: addi s0, sp, 160 ; RV64I-WITH-FP-NEXT: .cfi_def_cfa s0, 0 -; RV64I-WITH-FP-NEXT: lui t0, %hi(var) -; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(t0) +; RV64I-WITH-FP-NEXT: lui a4, %hi(var) 
+; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a4) ; RV64I-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(t0) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a4) ; RV64I-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(t0) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a4) ; RV64I-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(t0) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a4) ; RV64I-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: addi a5, t0, %lo(var) +; RV64I-WITH-FP-NEXT: addi a5, a4, %lo(var) ; RV64I-WITH-FP-NEXT: lw a0, 16(a5) ; RV64I-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: lw a0, 20(a5) @@ -958,22 +958,22 @@ define void @callee() { ; RV64I-WITH-FP-NEXT: lw s9, 84(a5) ; RV64I-WITH-FP-NEXT: lw s10, 88(a5) ; RV64I-WITH-FP-NEXT: lw s11, 92(a5) -; RV64I-WITH-FP-NEXT: lw ra, 112(a5) -; RV64I-WITH-FP-NEXT: lw a4, 116(a5) -; RV64I-WITH-FP-NEXT: lw a3, 120(a5) -; RV64I-WITH-FP-NEXT: lw a0, 124(a5) -; RV64I-WITH-FP-NEXT: lw a7, 96(a5) -; RV64I-WITH-FP-NEXT: lw a6, 100(a5) -; RV64I-WITH-FP-NEXT: lw a2, 104(a5) -; RV64I-WITH-FP-NEXT: lw a1, 108(a5) -; RV64I-WITH-FP-NEXT: sw a0, 124(a5) -; RV64I-WITH-FP-NEXT: sw a3, 120(a5) -; RV64I-WITH-FP-NEXT: sw a4, 116(a5) -; RV64I-WITH-FP-NEXT: sw ra, 112(a5) -; RV64I-WITH-FP-NEXT: sw a1, 108(a5) -; RV64I-WITH-FP-NEXT: sw a2, 104(a5) -; RV64I-WITH-FP-NEXT: sw a6, 100(a5) -; RV64I-WITH-FP-NEXT: sw a7, 96(a5) +; RV64I-WITH-FP-NEXT: lw ra, 96(a5) +; RV64I-WITH-FP-NEXT: lw a7, 100(a5) +; RV64I-WITH-FP-NEXT: lw a6, 104(a5) +; RV64I-WITH-FP-NEXT: lw a3, 108(a5) +; RV64I-WITH-FP-NEXT: lw a2, 112(a5) +; RV64I-WITH-FP-NEXT: lw a1, 116(a5) +; RV64I-WITH-FP-NEXT: lw a0, 120(a5) +; RV64I-WITH-FP-NEXT: lw t0, 124(a5) +; RV64I-WITH-FP-NEXT: sw t0, 124(a5) +; RV64I-WITH-FP-NEXT: sw a0, 120(a5) +; RV64I-WITH-FP-NEXT: sw a1, 116(a5) +; RV64I-WITH-FP-NEXT: sw a2, 112(a5) +; RV64I-WITH-FP-NEXT: sw a3, 108(a5) +; RV64I-WITH-FP-NEXT: sw a6, 104(a5) +; RV64I-WITH-FP-NEXT: sw a7, 100(a5) +; RV64I-WITH-FP-NEXT: sw ra, 96(a5) ; RV64I-WITH-FP-NEXT: sw s11, 92(a5) ; RV64I-WITH-FP-NEXT: sw s10, 88(a5) ; RV64I-WITH-FP-NEXT: sw s9, 84(a5) @@ -998,13 +998,13 @@ define void @callee() { ; RV64I-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: sw a0, 16(a5) ; RV64I-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(t0) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a4) ; RV64I-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(t0) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a4) ; RV64I-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(t0) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a4) ; RV64I-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(t0) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a4) ; RV64I-WITH-FP-NEXT: .cfi_def_cfa sp, 160 ; RV64I-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload @@ -1053,16 +1053,16 @@ define void @callee() { ; RV64IZCMP-NEXT: .cfi_offset s9, -24 ; RV64IZCMP-NEXT: .cfi_offset s10, -16 ; RV64IZCMP-NEXT: .cfi_offset s11, -8 -; RV64IZCMP-NEXT: lui t0, %hi(var) -; RV64IZCMP-NEXT: lw a0, %lo(var)(t0) +; RV64IZCMP-NEXT: lui a4, %hi(var) +; RV64IZCMP-NEXT: lw a0, %lo(var)(a4) ; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw 
a0, %lo(var+4)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a4) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+8)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a4) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+12)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a4) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, t0, %lo(var) +; RV64IZCMP-NEXT: addi a5, a4, %lo(var) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -1082,28 +1082,28 @@ define void @callee() { ; RV64IZCMP-NEXT: lw s11, 72(a5) ; RV64IZCMP-NEXT: lw ra, 76(a5) ; RV64IZCMP-NEXT: lw s1, 80(a5) -; RV64IZCMP-NEXT: lw t3, 84(a5) -; RV64IZCMP-NEXT: lw t2, 88(a5) -; RV64IZCMP-NEXT: lw t1, 92(a5) -; RV64IZCMP-NEXT: lw a7, 112(a5) -; RV64IZCMP-NEXT: lw s0, 116(a5) -; RV64IZCMP-NEXT: lw a3, 120(a5) -; RV64IZCMP-NEXT: lw a0, 124(a5) -; RV64IZCMP-NEXT: lw a6, 96(a5) -; RV64IZCMP-NEXT: lw a4, 100(a5) -; RV64IZCMP-NEXT: lw a2, 104(a5) -; RV64IZCMP-NEXT: lw a1, 108(a5) -; RV64IZCMP-NEXT: sw a0, 124(a5) -; RV64IZCMP-NEXT: sw a3, 120(a5) -; RV64IZCMP-NEXT: sw s0, 116(a5) -; RV64IZCMP-NEXT: sw a7, 112(a5) -; RV64IZCMP-NEXT: sw a1, 108(a5) -; RV64IZCMP-NEXT: sw a2, 104(a5) -; RV64IZCMP-NEXT: sw a4, 100(a5) -; RV64IZCMP-NEXT: sw a6, 96(a5) -; RV64IZCMP-NEXT: sw t1, 92(a5) -; RV64IZCMP-NEXT: sw t2, 88(a5) -; RV64IZCMP-NEXT: sw t3, 84(a5) +; RV64IZCMP-NEXT: lw t2, 84(a5) +; RV64IZCMP-NEXT: lw t1, 88(a5) +; RV64IZCMP-NEXT: lw t0, 92(a5) +; RV64IZCMP-NEXT: lw a7, 96(a5) +; RV64IZCMP-NEXT: lw s0, 100(a5) +; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a3, 108(a5) +; RV64IZCMP-NEXT: lw a2, 112(a5) +; RV64IZCMP-NEXT: lw a1, 116(a5) +; RV64IZCMP-NEXT: lw a0, 120(a5) +; RV64IZCMP-NEXT: lw t3, 124(a5) +; RV64IZCMP-NEXT: sw t3, 124(a5) +; RV64IZCMP-NEXT: sw a0, 120(a5) +; RV64IZCMP-NEXT: sw a1, 116(a5) +; RV64IZCMP-NEXT: sw a2, 112(a5) +; RV64IZCMP-NEXT: sw a3, 108(a5) +; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw s0, 100(a5) +; RV64IZCMP-NEXT: sw a7, 96(a5) +; RV64IZCMP-NEXT: sw t0, 92(a5) +; RV64IZCMP-NEXT: sw t1, 88(a5) +; RV64IZCMP-NEXT: sw t2, 84(a5) ; RV64IZCMP-NEXT: sw s1, 80(a5) ; RV64IZCMP-NEXT: sw ra, 76(a5) ; RV64IZCMP-NEXT: sw s11, 72(a5) @@ -1124,13 +1124,13 @@ define void @callee() { ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+12)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a4) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+8)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a4) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+4)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a4) ; RV64IZCMP-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var)(a4) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV64IZCMP-WITH-FP-LABEL: callee: @@ -1165,16 +1165,16 @@ define void @callee() { ; RV64IZCMP-WITH-FP-NEXT: .cfi_offset s11, -104 ; RV64IZCMP-WITH-FP-NEXT: addi s0, sp, 160 ; RV64IZCMP-WITH-FP-NEXT: .cfi_def_cfa s0, 0 -; RV64IZCMP-WITH-FP-NEXT: lui t1, %hi(var) -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(t1) +; RV64IZCMP-WITH-FP-NEXT: lui a4, %hi(var) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a4) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(t1) 
+; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a4) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(t1) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a4) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(t1) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a4) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: addi a5, t1, %lo(var) +; RV64IZCMP-WITH-FP-NEXT: addi a5, a4, %lo(var) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -1194,30 +1194,30 @@ define void @callee() { ; RV64IZCMP-WITH-FP-NEXT: lw s10, 68(a5) ; RV64IZCMP-WITH-FP-NEXT: lw s11, 72(a5) ; RV64IZCMP-WITH-FP-NEXT: lw ra, 76(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t4, 80(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t3, 84(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t2, 88(a5) +; RV64IZCMP-WITH-FP-NEXT: lw t3, 80(a5) +; RV64IZCMP-WITH-FP-NEXT: lw t2, 84(a5) +; RV64IZCMP-WITH-FP-NEXT: lw t1, 88(a5) ; RV64IZCMP-WITH-FP-NEXT: lw s1, 92(a5) -; RV64IZCMP-WITH-FP-NEXT: lw t0, 112(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a4, 116(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a3, 120(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a0, 124(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a7, 96(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a6, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a2, 104(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a1, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a0, 124(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a3, 120(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a4, 116(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t0, 112(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a1, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a2, 104(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a6, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a7, 96(a5) +; RV64IZCMP-WITH-FP-NEXT: lw t0, 96(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a7, 100(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a6, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a3, 108(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a2, 112(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a1, 116(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a0, 120(a5) +; RV64IZCMP-WITH-FP-NEXT: lw t4, 124(a5) +; RV64IZCMP-WITH-FP-NEXT: sw t4, 124(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a0, 120(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a1, 116(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a2, 112(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a3, 108(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a6, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a7, 100(a5) +; RV64IZCMP-WITH-FP-NEXT: sw t0, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: sw s1, 92(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t2, 88(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t3, 84(a5) -; RV64IZCMP-WITH-FP-NEXT: sw t4, 80(a5) +; RV64IZCMP-WITH-FP-NEXT: sw t1, 88(a5) +; RV64IZCMP-WITH-FP-NEXT: sw t2, 84(a5) +; RV64IZCMP-WITH-FP-NEXT: sw t3, 80(a5) ; RV64IZCMP-WITH-FP-NEXT: sw ra, 76(a5) ; RV64IZCMP-WITH-FP-NEXT: sw s11, 72(a5) ; RV64IZCMP-WITH-FP-NEXT: sw s10, 68(a5) @@ -1238,13 +1238,13 @@ define void @callee() { ; RV64IZCMP-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(t1) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a4) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(t1) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a4) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(t1) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a4) ; RV64IZCMP-WITH-FP-NEXT: 
ld a0, -112(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(t1) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a4) ; RV64IZCMP-WITH-FP-NEXT: .cfi_def_cfa sp, 160 ; RV64IZCMP-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll index 541c9b4d40c7e..aa08c3f5c95b1 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll @@ -225,8 +225,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu a0, 16(sp) ; RV32I-NEXT: mv s0, a7 +; RV32I-NEXT: lhu a0, 16(sp) ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: call __fixsfsi ; RV32I-NEXT: add a0, s0, a0 @@ -240,8 +240,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu a0, 16(sp) ; RV64I-NEXT: mv s0, a7 +; RV64I-NEXT: lhu a0, 16(sp) ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: call __fixsfdi ; RV64I-NEXT: addw a0, s0, a0 @@ -255,8 +255,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ; RV32IF-NEXT: addi sp, sp, -16 ; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32IF-NEXT: lhu a0, 16(sp) ; RV32IF-NEXT: mv s0, a7 +; RV32IF-NEXT: lhu a0, 16(sp) ; RV32IF-NEXT: call __extendhfsf2 ; RV32IF-NEXT: fmv.w.x fa5, a0 ; RV32IF-NEXT: fcvt.w.s a0, fa5, rtz @@ -271,8 +271,8 @@ define i32 @callee_half_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ; RV64IF-NEXT: addi sp, sp, -16 ; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64IF-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64IF-NEXT: lhu a0, 16(sp) ; RV64IF-NEXT: mv s0, a7 +; RV64IF-NEXT: lhu a0, 16(sp) ; RV64IF-NEXT: call __extendhfsf2 ; RV64IF-NEXT: fmv.w.x fa5, a0 ; RV64IF-NEXT: fcvt.l.s a0, fa5, rtz @@ -341,9 +341,9 @@ define i32 @caller_half_on_stack() nounwind { ; RV32I-NEXT: li a4, 5 ; RV32I-NEXT: li a5, 6 ; RV32I-NEXT: li a6, 7 -; RV32I-NEXT: addi t0, a7, -1792 +; RV32I-NEXT: addi a7, a7, -1792 +; RV32I-NEXT: sw a7, 0(sp) ; RV32I-NEXT: li a7, 8 -; RV32I-NEXT: sw t0, 0(sp) ; RV32I-NEXT: call callee_half_on_stack ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 @@ -361,9 +361,9 @@ define i32 @caller_half_on_stack() nounwind { ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: li a5, 6 ; RV64I-NEXT: li a6, 7 -; RV64I-NEXT: addiw t0, a7, -1792 +; RV64I-NEXT: addiw a7, a7, -1792 +; RV64I-NEXT: sd a7, 0(sp) ; RV64I-NEXT: li a7, 8 -; RV64I-NEXT: sd t0, 0(sp) ; RV64I-NEXT: call callee_half_on_stack ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 @@ -381,9 +381,9 @@ define i32 @caller_half_on_stack() nounwind { ; RV32IF-NEXT: li a4, 5 ; RV32IF-NEXT: li a5, 6 ; RV32IF-NEXT: li a6, 7 -; RV32IF-NEXT: addi t0, a7, -1792 +; RV32IF-NEXT: addi a7, a7, -1792 +; RV32IF-NEXT: sw a7, 0(sp) ; RV32IF-NEXT: li a7, 8 -; RV32IF-NEXT: sw t0, 0(sp) ; RV32IF-NEXT: call callee_half_on_stack ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 16 @@ -401,9 +401,9 @@ define i32 @caller_half_on_stack() nounwind { ; RV64IF-NEXT: li a4, 5 ; RV64IF-NEXT: li a5, 6 ; 
RV64IF-NEXT: li a6, 7 -; RV64IF-NEXT: addi t0, a7, -1792 +; RV64IF-NEXT: addi a7, a7, -1792 +; RV64IF-NEXT: sw a7, 0(sp) ; RV64IF-NEXT: li a7, 8 -; RV64IF-NEXT: sw t0, 0(sp) ; RV64IF-NEXT: call callee_half_on_stack ; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IF-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll index 9387b7ef4c32e..6697cd0e503e7 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll @@ -94,15 +94,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; RV32I-FPELIM-LABEL: callee_aligned_stack: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: lw a0, 0(a2) -; RV32I-FPELIM-NEXT: lw a1, 8(sp) -; RV32I-FPELIM-NEXT: lw a2, 0(sp) -; RV32I-FPELIM-NEXT: lw a3, 20(sp) +; RV32I-FPELIM-NEXT: lw a1, 20(sp) +; RV32I-FPELIM-NEXT: lw a2, 8(sp) +; RV32I-FPELIM-NEXT: lw a3, 0(sp) ; RV32I-FPELIM-NEXT: lw a4, 16(sp) ; RV32I-FPELIM-NEXT: add a0, a0, a7 -; RV32I-FPELIM-NEXT: add a1, a2, a1 +; RV32I-FPELIM-NEXT: add a2, a3, a2 +; RV32I-FPELIM-NEXT: add a0, a0, a2 +; RV32I-FPELIM-NEXT: add a1, a4, a1 ; RV32I-FPELIM-NEXT: add a0, a0, a1 -; RV32I-FPELIM-NEXT: add a3, a4, a3 -; RV32I-FPELIM-NEXT: add a0, a0, a3 ; RV32I-FPELIM-NEXT: ret ; ; RV32I-WITHFP-LABEL: callee_aligned_stack: @@ -112,15 +112,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 16 ; RV32I-WITHFP-NEXT: lw a0, 0(a2) -; RV32I-WITHFP-NEXT: lw a1, 8(s0) -; RV32I-WITHFP-NEXT: lw a2, 0(s0) -; RV32I-WITHFP-NEXT: lw a3, 20(s0) +; RV32I-WITHFP-NEXT: lw a1, 20(s0) +; RV32I-WITHFP-NEXT: lw a2, 8(s0) +; RV32I-WITHFP-NEXT: lw a3, 0(s0) ; RV32I-WITHFP-NEXT: lw a4, 16(s0) ; RV32I-WITHFP-NEXT: add a0, a0, a7 -; RV32I-WITHFP-NEXT: add a1, a2, a1 +; RV32I-WITHFP-NEXT: add a2, a3, a2 +; RV32I-WITHFP-NEXT: add a0, a0, a2 +; RV32I-WITHFP-NEXT: add a1, a4, a1 ; RV32I-WITHFP-NEXT: add a0, a0, a1 -; RV32I-WITHFP-NEXT: add a3, a4, a3 -; RV32I-WITHFP-NEXT: add a0, a0, a3 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: addi sp, sp, 16 @@ -145,45 +145,43 @@ define void @caller_aligned_stack() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -64 ; RV32I-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: li a5, 18 -; RV32I-FPELIM-NEXT: li a6, 17 -; RV32I-FPELIM-NEXT: li a7, 16 -; RV32I-FPELIM-NEXT: lui t0, 262236 -; RV32I-FPELIM-NEXT: lui t1, 377487 -; RV32I-FPELIM-NEXT: li t2, 15 -; RV32I-FPELIM-NEXT: lui t3, 262153 -; RV32I-FPELIM-NEXT: lui t4, 545260 -; RV32I-FPELIM-NEXT: lui t5, 964690 -; RV32I-FPELIM-NEXT: lui t6, 335544 -; RV32I-FPELIM-NEXT: lui s0, 688509 +; RV32I-FPELIM-NEXT: li a4, 18 +; RV32I-FPELIM-NEXT: li a5, 17 +; RV32I-FPELIM-NEXT: li a6, 16 +; RV32I-FPELIM-NEXT: lui a7, 262236 +; RV32I-FPELIM-NEXT: lui t0, 377487 +; RV32I-FPELIM-NEXT: li t1, 15 +; RV32I-FPELIM-NEXT: lui t2, 262153 +; RV32I-FPELIM-NEXT: lui t3, 545260 +; RV32I-FPELIM-NEXT: lui t4, 964690 +; RV32I-FPELIM-NEXT: lui t5, 335544 +; RV32I-FPELIM-NEXT: lui t6, 688509 ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a1, 11 ; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: li a3, 12 +; RV32I-FPELIM-NEXT: sw a5, 20(sp) +; RV32I-FPELIM-NEXT: 
sw a4, 24(sp) ; RV32I-FPELIM-NEXT: li a4, 13 -; RV32I-FPELIM-NEXT: sw a6, 20(sp) -; RV32I-FPELIM-NEXT: sw a5, 24(sp) -; RV32I-FPELIM-NEXT: li a6, 4 -; RV32I-FPELIM-NEXT: addi a5, t0, 655 -; RV32I-FPELIM-NEXT: addi t0, t1, 1475 -; RV32I-FPELIM-NEXT: sw t2, 0(sp) -; RV32I-FPELIM-NEXT: sw t0, 8(sp) +; RV32I-FPELIM-NEXT: addi a5, a7, 655 +; RV32I-FPELIM-NEXT: addi a7, t0, 1475 +; RV32I-FPELIM-NEXT: sw t1, 0(sp) +; RV32I-FPELIM-NEXT: sw a7, 8(sp) ; RV32I-FPELIM-NEXT: sw a5, 12(sp) -; RV32I-FPELIM-NEXT: sw a7, 16(sp) +; RV32I-FPELIM-NEXT: sw a6, 16(sp) +; RV32I-FPELIM-NEXT: li a6, 4 +; RV32I-FPELIM-NEXT: addi a7, t2, 491 +; RV32I-FPELIM-NEXT: addi t0, t3, -1967 +; RV32I-FPELIM-NEXT: addi t1, t4, -328 +; RV32I-FPELIM-NEXT: addi t2, t5, 1311 +; RV32I-FPELIM-NEXT: addi a5, t6, -2048 +; RV32I-FPELIM-NEXT: sw t2, 32(sp) +; RV32I-FPELIM-NEXT: sw t1, 36(sp) +; RV32I-FPELIM-NEXT: sw t0, 40(sp) +; RV32I-FPELIM-NEXT: sw a7, 44(sp) ; RV32I-FPELIM-NEXT: li a7, 14 -; RV32I-FPELIM-NEXT: addi t0, t3, 491 -; RV32I-FPELIM-NEXT: addi t1, t4, -1967 -; RV32I-FPELIM-NEXT: addi t2, t5, -328 -; RV32I-FPELIM-NEXT: addi t3, t6, 1311 -; RV32I-FPELIM-NEXT: addi a5, s0, -2048 -; RV32I-FPELIM-NEXT: sw t3, 32(sp) -; RV32I-FPELIM-NEXT: sw t2, 36(sp) -; RV32I-FPELIM-NEXT: sw t1, 40(sp) -; RV32I-FPELIM-NEXT: sw t0, 44(sp) ; RV32I-FPELIM-NEXT: call callee_aligned_stack ; RV32I-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; RV32I-FPELIM-NEXT: lw s0, 56(sp) # 4-byte Folded Reload ; RV32I-FPELIM-NEXT: addi sp, sp, 64 ; RV32I-FPELIM-NEXT: ret ; @@ -192,47 +190,45 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: addi sp, sp, -64 ; RV32I-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; RV32I-WITHFP-NEXT: sw s1, 52(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 64 -; RV32I-WITHFP-NEXT: li a5, 18 -; RV32I-WITHFP-NEXT: li a6, 17 -; RV32I-WITHFP-NEXT: li a7, 16 -; RV32I-WITHFP-NEXT: lui t0, 262236 -; RV32I-WITHFP-NEXT: lui t1, 377487 -; RV32I-WITHFP-NEXT: li t2, 15 -; RV32I-WITHFP-NEXT: lui t3, 262153 -; RV32I-WITHFP-NEXT: lui t4, 545260 -; RV32I-WITHFP-NEXT: lui t5, 964690 -; RV32I-WITHFP-NEXT: lui t6, 335544 -; RV32I-WITHFP-NEXT: lui s1, 688509 +; RV32I-WITHFP-NEXT: li a4, 18 +; RV32I-WITHFP-NEXT: li a5, 17 +; RV32I-WITHFP-NEXT: li a6, 16 +; RV32I-WITHFP-NEXT: lui a7, 262236 +; RV32I-WITHFP-NEXT: lui t0, 377487 +; RV32I-WITHFP-NEXT: li t1, 15 +; RV32I-WITHFP-NEXT: lui t2, 262153 +; RV32I-WITHFP-NEXT: lui t3, 545260 +; RV32I-WITHFP-NEXT: lui t4, 964690 +; RV32I-WITHFP-NEXT: lui t5, 335544 +; RV32I-WITHFP-NEXT: lui t6, 688509 ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a1, 11 ; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: li a3, 12 +; RV32I-WITHFP-NEXT: sw a5, 20(sp) +; RV32I-WITHFP-NEXT: sw a4, 24(sp) ; RV32I-WITHFP-NEXT: li a4, 13 -; RV32I-WITHFP-NEXT: sw a6, 20(sp) -; RV32I-WITHFP-NEXT: sw a5, 24(sp) -; RV32I-WITHFP-NEXT: li a6, 4 -; RV32I-WITHFP-NEXT: addi a5, t0, 655 -; RV32I-WITHFP-NEXT: addi t0, t1, 1475 -; RV32I-WITHFP-NEXT: sw t2, 0(sp) -; RV32I-WITHFP-NEXT: sw t0, 8(sp) +; RV32I-WITHFP-NEXT: addi a5, a7, 655 +; RV32I-WITHFP-NEXT: addi a7, t0, 1475 +; RV32I-WITHFP-NEXT: sw t1, 0(sp) +; RV32I-WITHFP-NEXT: sw a7, 8(sp) ; RV32I-WITHFP-NEXT: sw a5, 12(sp) -; RV32I-WITHFP-NEXT: sw a7, 16(sp) +; RV32I-WITHFP-NEXT: sw a6, 16(sp) +; RV32I-WITHFP-NEXT: li a6, 4 +; RV32I-WITHFP-NEXT: addi a7, t2, 491 +; RV32I-WITHFP-NEXT: addi t0, t3, -1967 +; RV32I-WITHFP-NEXT: addi t1, t4, -328 +; RV32I-WITHFP-NEXT: addi t2, t5, 1311 
+; RV32I-WITHFP-NEXT: addi a5, t6, -2048 +; RV32I-WITHFP-NEXT: sw t2, -32(s0) +; RV32I-WITHFP-NEXT: sw t1, -28(s0) +; RV32I-WITHFP-NEXT: sw t0, -24(s0) +; RV32I-WITHFP-NEXT: sw a7, -20(s0) ; RV32I-WITHFP-NEXT: li a7, 14 -; RV32I-WITHFP-NEXT: addi t0, t3, 491 -; RV32I-WITHFP-NEXT: addi t1, t4, -1967 -; RV32I-WITHFP-NEXT: addi t2, t5, -328 -; RV32I-WITHFP-NEXT: addi t3, t6, 1311 -; RV32I-WITHFP-NEXT: addi a5, s1, -2048 -; RV32I-WITHFP-NEXT: sw t3, -32(s0) -; RV32I-WITHFP-NEXT: sw t2, -28(s0) -; RV32I-WITHFP-NEXT: sw t1, -24(s0) -; RV32I-WITHFP-NEXT: sw t0, -20(s0) ; RV32I-WITHFP-NEXT: call callee_aligned_stack ; RV32I-WITHFP-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; RV32I-WITHFP-NEXT: lw s1, 52(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: addi sp, sp, 64 ; RV32I-WITHFP-NEXT: ret %1 = call i32 @callee_aligned_stack(i32 1, i32 11, diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll index 18916dd69eb43..f54e86b497945 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -149,9 +149,9 @@ define i32 @caller_many_scalars() nounwind { ; RV32I-FPELIM-NEXT: li a3, 4 ; RV32I-FPELIM-NEXT: li a5, 5 ; RV32I-FPELIM-NEXT: li a6, 6 -; RV32I-FPELIM-NEXT: li a7, 7 ; RV32I-FPELIM-NEXT: sw zero, 0(sp) ; RV32I-FPELIM-NEXT: sw a4, 4(sp) +; RV32I-FPELIM-NEXT: li a7, 7 ; RV32I-FPELIM-NEXT: li a4, 0 ; RV32I-FPELIM-NEXT: call callee_many_scalars ; RV32I-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -171,9 +171,9 @@ define i32 @caller_many_scalars() nounwind { ; RV32I-WITHFP-NEXT: li a3, 4 ; RV32I-WITHFP-NEXT: li a5, 5 ; RV32I-WITHFP-NEXT: li a6, 6 -; RV32I-WITHFP-NEXT: li a7, 7 ; RV32I-WITHFP-NEXT: sw zero, 0(sp) ; RV32I-WITHFP-NEXT: sw a4, 4(sp) +; RV32I-WITHFP-NEXT: li a7, 7 ; RV32I-WITHFP-NEXT: li a4, 0 ; RV32I-WITHFP-NEXT: call callee_many_scalars ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -194,17 +194,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind { ; RV32I-FPELIM-NEXT: lw a3, 4(a1) ; RV32I-FPELIM-NEXT: lw a4, 8(a1) ; RV32I-FPELIM-NEXT: lw a1, 12(a1) -; RV32I-FPELIM-NEXT: lw a5, 12(a0) +; RV32I-FPELIM-NEXT: lw a5, 0(a0) ; RV32I-FPELIM-NEXT: lw a6, 4(a0) ; RV32I-FPELIM-NEXT: lw a7, 8(a0) -; RV32I-FPELIM-NEXT: lw a0, 0(a0) -; RV32I-FPELIM-NEXT: xor a1, a5, a1 -; RV32I-FPELIM-NEXT: xor a3, a6, a3 -; RV32I-FPELIM-NEXT: xor a4, a7, a4 -; RV32I-FPELIM-NEXT: xor a0, a0, a2 -; RV32I-FPELIM-NEXT: or a1, a3, a1 -; RV32I-FPELIM-NEXT: or a0, a0, a4 -; RV32I-FPELIM-NEXT: or a0, a0, a1 +; RV32I-FPELIM-NEXT: lw a0, 12(a0) +; RV32I-FPELIM-NEXT: xor a0, a0, a1 +; RV32I-FPELIM-NEXT: xor a1, a6, a3 +; RV32I-FPELIM-NEXT: xor a3, a7, a4 +; RV32I-FPELIM-NEXT: xor a2, a5, a2 +; RV32I-FPELIM-NEXT: or a0, a1, a0 +; RV32I-FPELIM-NEXT: or a2, a2, a3 +; RV32I-FPELIM-NEXT: or a0, a2, a0 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -218,17 +218,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind { ; RV32I-WITHFP-NEXT: lw a3, 4(a1) ; RV32I-WITHFP-NEXT: lw a4, 8(a1) ; RV32I-WITHFP-NEXT: lw a1, 12(a1) -; RV32I-WITHFP-NEXT: lw a5, 12(a0) +; RV32I-WITHFP-NEXT: lw a5, 0(a0) ; RV32I-WITHFP-NEXT: lw a6, 4(a0) ; RV32I-WITHFP-NEXT: lw a7, 8(a0) -; RV32I-WITHFP-NEXT: lw a0, 0(a0) -; RV32I-WITHFP-NEXT: xor a1, a5, a1 -; RV32I-WITHFP-NEXT: xor a3, a6, a3 -; RV32I-WITHFP-NEXT: xor a4, a7, a4 -; RV32I-WITHFP-NEXT: xor 
a0, a0, a2 -; RV32I-WITHFP-NEXT: or a1, a3, a1 -; RV32I-WITHFP-NEXT: or a0, a0, a4 -; RV32I-WITHFP-NEXT: or a0, a0, a1 +; RV32I-WITHFP-NEXT: lw a0, 12(a0) +; RV32I-WITHFP-NEXT: xor a0, a0, a1 +; RV32I-WITHFP-NEXT: xor a1, a6, a3 +; RV32I-WITHFP-NEXT: xor a3, a7, a4 +; RV32I-WITHFP-NEXT: xor a2, a5, a2 +; RV32I-WITHFP-NEXT: or a0, a1, a0 +; RV32I-WITHFP-NEXT: or a2, a2, a3 +; RV32I-WITHFP-NEXT: or a0, a2, a0 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -245,18 +245,18 @@ define i32 @caller_large_scalars() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -48 ; RV32I-FPELIM-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: lui a1, 524272 -; RV32I-FPELIM-NEXT: li a2, 1 -; RV32I-FPELIM-NEXT: addi a0, sp, 24 +; RV32I-FPELIM-NEXT: lui a0, 524272 +; RV32I-FPELIM-NEXT: li a1, 1 ; RV32I-FPELIM-NEXT: sw zero, 0(sp) ; RV32I-FPELIM-NEXT: sw zero, 4(sp) ; RV32I-FPELIM-NEXT: sw zero, 8(sp) -; RV32I-FPELIM-NEXT: sw a1, 12(sp) -; RV32I-FPELIM-NEXT: mv a1, sp -; RV32I-FPELIM-NEXT: sw a2, 24(sp) +; RV32I-FPELIM-NEXT: sw a0, 12(sp) +; RV32I-FPELIM-NEXT: addi a0, sp, 24 +; RV32I-FPELIM-NEXT: sw a1, 24(sp) ; RV32I-FPELIM-NEXT: sw zero, 28(sp) ; RV32I-FPELIM-NEXT: sw zero, 32(sp) ; RV32I-FPELIM-NEXT: sw zero, 36(sp) +; RV32I-FPELIM-NEXT: mv a1, sp ; RV32I-FPELIM-NEXT: call callee_large_scalars ; RV32I-FPELIM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-FPELIM-NEXT: addi sp, sp, 48 @@ -268,18 +268,18 @@ define i32 @caller_large_scalars() nounwind { ; RV32I-WITHFP-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 40(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 48 -; RV32I-WITHFP-NEXT: lui a1, 524272 -; RV32I-WITHFP-NEXT: li a2, 1 -; RV32I-WITHFP-NEXT: addi a0, s0, -24 +; RV32I-WITHFP-NEXT: lui a0, 524272 +; RV32I-WITHFP-NEXT: li a1, 1 ; RV32I-WITHFP-NEXT: sw zero, -48(s0) ; RV32I-WITHFP-NEXT: sw zero, -44(s0) ; RV32I-WITHFP-NEXT: sw zero, -40(s0) -; RV32I-WITHFP-NEXT: sw a1, -36(s0) -; RV32I-WITHFP-NEXT: addi a1, s0, -48 -; RV32I-WITHFP-NEXT: sw a2, -24(s0) +; RV32I-WITHFP-NEXT: sw a0, -36(s0) +; RV32I-WITHFP-NEXT: addi a0, s0, -24 +; RV32I-WITHFP-NEXT: sw a1, -24(s0) ; RV32I-WITHFP-NEXT: sw zero, -20(s0) ; RV32I-WITHFP-NEXT: sw zero, -16(s0) ; RV32I-WITHFP-NEXT: sw zero, -12(s0) +; RV32I-WITHFP-NEXT: addi a1, s0, -48 ; RV32I-WITHFP-NEXT: call callee_large_scalars ; RV32I-WITHFP-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 40(sp) # 4-byte Folded Reload @@ -301,17 +301,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; RV32I-FPELIM-NEXT: lw a2, 4(a7) ; RV32I-FPELIM-NEXT: lw a3, 8(a7) ; RV32I-FPELIM-NEXT: lw a4, 12(a7) -; RV32I-FPELIM-NEXT: lw a5, 12(a0) +; RV32I-FPELIM-NEXT: lw a5, 0(a0) ; RV32I-FPELIM-NEXT: lw a6, 4(a0) ; RV32I-FPELIM-NEXT: lw a7, 8(a0) -; RV32I-FPELIM-NEXT: lw a0, 0(a0) -; RV32I-FPELIM-NEXT: xor a4, a4, a5 +; RV32I-FPELIM-NEXT: lw a0, 12(a0) +; RV32I-FPELIM-NEXT: xor a0, a4, a0 ; RV32I-FPELIM-NEXT: xor a2, a2, a6 ; RV32I-FPELIM-NEXT: xor a3, a3, a7 -; RV32I-FPELIM-NEXT: xor a0, a1, a0 -; RV32I-FPELIM-NEXT: or a2, a2, a4 -; RV32I-FPELIM-NEXT: or a0, a0, a3 -; RV32I-FPELIM-NEXT: or a0, a0, a2 +; RV32I-FPELIM-NEXT: xor a1, a1, a5 +; RV32I-FPELIM-NEXT: or a0, a2, a0 +; RV32I-FPELIM-NEXT: or a1, a1, a3 +; RV32I-FPELIM-NEXT: or a0, a1, a0 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -326,17 +326,17 @@ define i32 
@callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; RV32I-WITHFP-NEXT: lw a2, 4(a7) ; RV32I-WITHFP-NEXT: lw a3, 8(a7) ; RV32I-WITHFP-NEXT: lw a4, 12(a7) -; RV32I-WITHFP-NEXT: lw a5, 12(a0) +; RV32I-WITHFP-NEXT: lw a5, 0(a0) ; RV32I-WITHFP-NEXT: lw a6, 4(a0) ; RV32I-WITHFP-NEXT: lw a7, 8(a0) -; RV32I-WITHFP-NEXT: lw a0, 0(a0) -; RV32I-WITHFP-NEXT: xor a4, a4, a5 +; RV32I-WITHFP-NEXT: lw a0, 12(a0) +; RV32I-WITHFP-NEXT: xor a0, a4, a0 ; RV32I-WITHFP-NEXT: xor a2, a2, a6 ; RV32I-WITHFP-NEXT: xor a3, a3, a7 -; RV32I-WITHFP-NEXT: xor a0, a1, a0 -; RV32I-WITHFP-NEXT: or a2, a2, a4 -; RV32I-WITHFP-NEXT: or a0, a0, a3 -; RV32I-WITHFP-NEXT: or a0, a0, a2 +; RV32I-WITHFP-NEXT: xor a1, a1, a5 +; RV32I-WITHFP-NEXT: or a0, a2, a0 +; RV32I-WITHFP-NEXT: or a1, a1, a3 +; RV32I-WITHFP-NEXT: or a0, a1, a0 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -353,28 +353,28 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -64 ; RV32I-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: addi a6, sp, 16 -; RV32I-FPELIM-NEXT: li a7, 9 -; RV32I-FPELIM-NEXT: lui t0, 524272 -; RV32I-FPELIM-NEXT: li t1, 8 +; RV32I-FPELIM-NEXT: addi a5, sp, 16 +; RV32I-FPELIM-NEXT: li a6, 9 +; RV32I-FPELIM-NEXT: lui a7, 524272 +; RV32I-FPELIM-NEXT: li t0, 8 ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a1, 2 ; RV32I-FPELIM-NEXT: li a2, 3 ; RV32I-FPELIM-NEXT: li a3, 4 ; RV32I-FPELIM-NEXT: li a4, 5 +; RV32I-FPELIM-NEXT: sw a6, 0(sp) +; RV32I-FPELIM-NEXT: sw a5, 4(sp) ; RV32I-FPELIM-NEXT: li a5, 6 -; RV32I-FPELIM-NEXT: sw a7, 0(sp) -; RV32I-FPELIM-NEXT: sw a6, 4(sp) -; RV32I-FPELIM-NEXT: li a6, 7 ; RV32I-FPELIM-NEXT: sw zero, 16(sp) ; RV32I-FPELIM-NEXT: sw zero, 20(sp) ; RV32I-FPELIM-NEXT: sw zero, 24(sp) -; RV32I-FPELIM-NEXT: sw t0, 28(sp) -; RV32I-FPELIM-NEXT: addi a7, sp, 40 -; RV32I-FPELIM-NEXT: sw t1, 40(sp) +; RV32I-FPELIM-NEXT: sw a7, 28(sp) +; RV32I-FPELIM-NEXT: li a6, 7 +; RV32I-FPELIM-NEXT: sw t0, 40(sp) ; RV32I-FPELIM-NEXT: sw zero, 44(sp) ; RV32I-FPELIM-NEXT: sw zero, 48(sp) ; RV32I-FPELIM-NEXT: sw zero, 52(sp) +; RV32I-FPELIM-NEXT: addi a7, sp, 40 ; RV32I-FPELIM-NEXT: call callee_large_scalars_exhausted_regs ; RV32I-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32I-FPELIM-NEXT: addi sp, sp, 64 @@ -386,28 +386,28 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind { ; RV32I-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 64 -; RV32I-WITHFP-NEXT: addi a6, s0, -48 -; RV32I-WITHFP-NEXT: li a7, 9 -; RV32I-WITHFP-NEXT: lui t0, 524272 -; RV32I-WITHFP-NEXT: li t1, 8 +; RV32I-WITHFP-NEXT: addi a5, s0, -48 +; RV32I-WITHFP-NEXT: li a6, 9 +; RV32I-WITHFP-NEXT: lui a7, 524272 +; RV32I-WITHFP-NEXT: li t0, 8 ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a1, 2 ; RV32I-WITHFP-NEXT: li a2, 3 ; RV32I-WITHFP-NEXT: li a3, 4 ; RV32I-WITHFP-NEXT: li a4, 5 +; RV32I-WITHFP-NEXT: sw a6, 0(sp) +; RV32I-WITHFP-NEXT: sw a5, 4(sp) ; RV32I-WITHFP-NEXT: li a5, 6 -; RV32I-WITHFP-NEXT: sw a7, 0(sp) -; RV32I-WITHFP-NEXT: sw a6, 4(sp) -; RV32I-WITHFP-NEXT: li a6, 7 ; RV32I-WITHFP-NEXT: sw zero, -48(s0) ; RV32I-WITHFP-NEXT: sw zero, -44(s0) ; RV32I-WITHFP-NEXT: sw zero, -40(s0) -; RV32I-WITHFP-NEXT: sw t0, -36(s0) -; RV32I-WITHFP-NEXT: addi a7, s0, -24 -; RV32I-WITHFP-NEXT: sw t1, -24(s0) +; RV32I-WITHFP-NEXT: sw a7, -36(s0) +; 
RV32I-WITHFP-NEXT: li a6, 7 +; RV32I-WITHFP-NEXT: sw t0, -24(s0) ; RV32I-WITHFP-NEXT: sw zero, -20(s0) ; RV32I-WITHFP-NEXT: sw zero, -16(s0) ; RV32I-WITHFP-NEXT: sw zero, -12(s0) +; RV32I-WITHFP-NEXT: addi a7, s0, -24 ; RV32I-WITHFP-NEXT: call callee_large_scalars_exhausted_regs ; RV32I-WITHFP-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 56(sp) # 4-byte Folded Reload @@ -614,15 +614,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; RV32I-FPELIM-LABEL: callee_aligned_stack: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: lw a0, 0(a2) -; RV32I-FPELIM-NEXT: lw a1, 8(sp) -; RV32I-FPELIM-NEXT: lw a2, 0(sp) -; RV32I-FPELIM-NEXT: lw a3, 20(sp) +; RV32I-FPELIM-NEXT: lw a1, 20(sp) +; RV32I-FPELIM-NEXT: lw a2, 8(sp) +; RV32I-FPELIM-NEXT: lw a3, 0(sp) ; RV32I-FPELIM-NEXT: lw a4, 16(sp) ; RV32I-FPELIM-NEXT: add a0, a0, a7 -; RV32I-FPELIM-NEXT: add a1, a2, a1 +; RV32I-FPELIM-NEXT: add a2, a3, a2 +; RV32I-FPELIM-NEXT: add a0, a0, a2 +; RV32I-FPELIM-NEXT: add a1, a4, a1 ; RV32I-FPELIM-NEXT: add a0, a0, a1 -; RV32I-FPELIM-NEXT: add a3, a4, a3 -; RV32I-FPELIM-NEXT: add a0, a0, a3 ; RV32I-FPELIM-NEXT: ret ; ; RV32I-WITHFP-LABEL: callee_aligned_stack: @@ -632,15 +632,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 16 ; RV32I-WITHFP-NEXT: lw a0, 0(a2) -; RV32I-WITHFP-NEXT: lw a1, 8(s0) -; RV32I-WITHFP-NEXT: lw a2, 0(s0) -; RV32I-WITHFP-NEXT: lw a3, 20(s0) +; RV32I-WITHFP-NEXT: lw a1, 20(s0) +; RV32I-WITHFP-NEXT: lw a2, 8(s0) +; RV32I-WITHFP-NEXT: lw a3, 0(s0) ; RV32I-WITHFP-NEXT: lw a4, 16(s0) ; RV32I-WITHFP-NEXT: add a0, a0, a7 -; RV32I-WITHFP-NEXT: add a1, a2, a1 +; RV32I-WITHFP-NEXT: add a2, a3, a2 +; RV32I-WITHFP-NEXT: add a0, a0, a2 +; RV32I-WITHFP-NEXT: add a1, a4, a1 ; RV32I-WITHFP-NEXT: add a0, a0, a1 -; RV32I-WITHFP-NEXT: add a3, a4, a3 -; RV32I-WITHFP-NEXT: add a0, a0, a3 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: addi sp, sp, 16 @@ -664,38 +664,38 @@ define void @caller_aligned_stack() nounwind { ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -64 ; RV32I-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32I-FPELIM-NEXT: li a5, 19 -; RV32I-FPELIM-NEXT: li a6, 18 -; RV32I-FPELIM-NEXT: li a7, 17 -; RV32I-FPELIM-NEXT: li t0, 16 -; RV32I-FPELIM-NEXT: li t1, 15 -; RV32I-FPELIM-NEXT: lui t2, 262153 -; RV32I-FPELIM-NEXT: lui t3, 545260 -; RV32I-FPELIM-NEXT: lui t4, 964690 -; RV32I-FPELIM-NEXT: lui t5, 335544 -; RV32I-FPELIM-NEXT: lui t6, 688509 +; RV32I-FPELIM-NEXT: li a4, 19 +; RV32I-FPELIM-NEXT: li a5, 18 +; RV32I-FPELIM-NEXT: li a6, 17 +; RV32I-FPELIM-NEXT: li a7, 16 +; RV32I-FPELIM-NEXT: li t0, 15 +; RV32I-FPELIM-NEXT: lui t1, 262153 +; RV32I-FPELIM-NEXT: lui t2, 545260 +; RV32I-FPELIM-NEXT: lui t3, 964690 +; RV32I-FPELIM-NEXT: lui t4, 335544 +; RV32I-FPELIM-NEXT: lui t5, 688509 ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a1, 11 ; RV32I-FPELIM-NEXT: addi a2, sp, 32 ; RV32I-FPELIM-NEXT: li a3, 12 +; RV32I-FPELIM-NEXT: sw a5, 20(sp) +; RV32I-FPELIM-NEXT: sw a4, 24(sp) ; RV32I-FPELIM-NEXT: li a4, 13 -; RV32I-FPELIM-NEXT: sw a6, 20(sp) -; RV32I-FPELIM-NEXT: sw a5, 24(sp) -; RV32I-FPELIM-NEXT: li a6, 4 -; RV32I-FPELIM-NEXT: sw t1, 0(sp) -; RV32I-FPELIM-NEXT: sw t0, 8(sp) +; RV32I-FPELIM-NEXT: sw t0, 0(sp) +; RV32I-FPELIM-NEXT: sw a7, 8(sp) ; RV32I-FPELIM-NEXT: sw zero, 12(sp) -; RV32I-FPELIM-NEXT: sw 
a7, 16(sp) +; RV32I-FPELIM-NEXT: sw a6, 16(sp) +; RV32I-FPELIM-NEXT: li a6, 4 +; RV32I-FPELIM-NEXT: addi a7, t1, 491 +; RV32I-FPELIM-NEXT: addi t0, t2, -1967 +; RV32I-FPELIM-NEXT: addi t1, t3, -328 +; RV32I-FPELIM-NEXT: addi t2, t4, 1311 +; RV32I-FPELIM-NEXT: addi a5, t5, -2048 +; RV32I-FPELIM-NEXT: sw t2, 32(sp) +; RV32I-FPELIM-NEXT: sw t1, 36(sp) +; RV32I-FPELIM-NEXT: sw t0, 40(sp) +; RV32I-FPELIM-NEXT: sw a7, 44(sp) ; RV32I-FPELIM-NEXT: li a7, 14 -; RV32I-FPELIM-NEXT: addi t0, t2, 491 -; RV32I-FPELIM-NEXT: addi t1, t3, -1967 -; RV32I-FPELIM-NEXT: addi t2, t4, -328 -; RV32I-FPELIM-NEXT: addi t3, t5, 1311 -; RV32I-FPELIM-NEXT: addi a5, t6, -2048 -; RV32I-FPELIM-NEXT: sw t3, 32(sp) -; RV32I-FPELIM-NEXT: sw t2, 36(sp) -; RV32I-FPELIM-NEXT: sw t1, 40(sp) -; RV32I-FPELIM-NEXT: sw t0, 44(sp) ; RV32I-FPELIM-NEXT: call callee_aligned_stack ; RV32I-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32I-FPELIM-NEXT: addi sp, sp, 64 @@ -707,38 +707,38 @@ define void @caller_aligned_stack() nounwind { ; RV32I-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 64 -; RV32I-WITHFP-NEXT: li a5, 19 -; RV32I-WITHFP-NEXT: li a6, 18 -; RV32I-WITHFP-NEXT: li a7, 17 -; RV32I-WITHFP-NEXT: li t0, 16 -; RV32I-WITHFP-NEXT: li t1, 15 -; RV32I-WITHFP-NEXT: lui t2, 262153 -; RV32I-WITHFP-NEXT: lui t3, 545260 -; RV32I-WITHFP-NEXT: lui t4, 964690 -; RV32I-WITHFP-NEXT: lui t5, 335544 -; RV32I-WITHFP-NEXT: lui t6, 688509 +; RV32I-WITHFP-NEXT: li a4, 19 +; RV32I-WITHFP-NEXT: li a5, 18 +; RV32I-WITHFP-NEXT: li a6, 17 +; RV32I-WITHFP-NEXT: li a7, 16 +; RV32I-WITHFP-NEXT: li t0, 15 +; RV32I-WITHFP-NEXT: lui t1, 262153 +; RV32I-WITHFP-NEXT: lui t2, 545260 +; RV32I-WITHFP-NEXT: lui t3, 964690 +; RV32I-WITHFP-NEXT: lui t4, 335544 +; RV32I-WITHFP-NEXT: lui t5, 688509 ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a1, 11 ; RV32I-WITHFP-NEXT: addi a2, s0, -32 ; RV32I-WITHFP-NEXT: li a3, 12 +; RV32I-WITHFP-NEXT: sw a5, 20(sp) +; RV32I-WITHFP-NEXT: sw a4, 24(sp) ; RV32I-WITHFP-NEXT: li a4, 13 -; RV32I-WITHFP-NEXT: sw a6, 20(sp) -; RV32I-WITHFP-NEXT: sw a5, 24(sp) -; RV32I-WITHFP-NEXT: li a6, 4 -; RV32I-WITHFP-NEXT: sw t1, 0(sp) -; RV32I-WITHFP-NEXT: sw t0, 8(sp) +; RV32I-WITHFP-NEXT: sw t0, 0(sp) +; RV32I-WITHFP-NEXT: sw a7, 8(sp) ; RV32I-WITHFP-NEXT: sw zero, 12(sp) -; RV32I-WITHFP-NEXT: sw a7, 16(sp) +; RV32I-WITHFP-NEXT: sw a6, 16(sp) +; RV32I-WITHFP-NEXT: li a6, 4 +; RV32I-WITHFP-NEXT: addi a7, t1, 491 +; RV32I-WITHFP-NEXT: addi t0, t2, -1967 +; RV32I-WITHFP-NEXT: addi t1, t3, -328 +; RV32I-WITHFP-NEXT: addi t2, t4, 1311 +; RV32I-WITHFP-NEXT: addi a5, t5, -2048 +; RV32I-WITHFP-NEXT: sw t2, -32(s0) +; RV32I-WITHFP-NEXT: sw t1, -28(s0) +; RV32I-WITHFP-NEXT: sw t0, -24(s0) +; RV32I-WITHFP-NEXT: sw a7, -20(s0) ; RV32I-WITHFP-NEXT: li a7, 14 -; RV32I-WITHFP-NEXT: addi t0, t2, 491 -; RV32I-WITHFP-NEXT: addi t1, t3, -1967 -; RV32I-WITHFP-NEXT: addi t2, t4, -328 -; RV32I-WITHFP-NEXT: addi t3, t5, 1311 -; RV32I-WITHFP-NEXT: addi a5, t6, -2048 -; RV32I-WITHFP-NEXT: sw t3, -32(s0) -; RV32I-WITHFP-NEXT: sw t2, -28(s0) -; RV32I-WITHFP-NEXT: sw t1, -24(s0) -; RV32I-WITHFP-NEXT: sw t0, -20(s0) ; RV32I-WITHFP-NEXT: call callee_aligned_stack ; RV32I-WITHFP-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 56(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll index 1dac139503ba7..5e37c83d30ba8 100644 --- 
a/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32.ll @@ -111,8 +111,8 @@ define i32 @caller_float_on_stack() nounwind { ; RV32I-FPELIM-NEXT: li a0, 1 ; RV32I-FPELIM-NEXT: li a2, 2 ; RV32I-FPELIM-NEXT: li a4, 3 -; RV32I-FPELIM-NEXT: li a6, 4 ; RV32I-FPELIM-NEXT: sw a1, 0(sp) +; RV32I-FPELIM-NEXT: li a6, 4 ; RV32I-FPELIM-NEXT: li a1, 0 ; RV32I-FPELIM-NEXT: li a3, 0 ; RV32I-FPELIM-NEXT: li a5, 0 @@ -132,8 +132,8 @@ define i32 @caller_float_on_stack() nounwind { ; RV32I-WITHFP-NEXT: li a0, 1 ; RV32I-WITHFP-NEXT: li a2, 2 ; RV32I-WITHFP-NEXT: li a4, 3 -; RV32I-WITHFP-NEXT: li a6, 4 ; RV32I-WITHFP-NEXT: sw a1, 0(sp) +; RV32I-WITHFP-NEXT: li a6, 4 ; RV32I-WITHFP-NEXT: li a1, 0 ; RV32I-WITHFP-NEXT: li a3, 0 ; RV32I-WITHFP-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll index 7630d5b8f77ef..3ae76de6a65f7 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll @@ -51,14 +51,14 @@ define i32 @caller_double_in_fpr_exhausted_gprs() nounwind { ; RV32-ILP32D: # %bb.0: ; RV32-ILP32D-NEXT: addi sp, sp, -16 ; RV32-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32D-NEXT: li a1, 5 -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI3_0) -; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI3_0)(a0) +; RV32-ILP32D-NEXT: li a0, 5 +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI3_0) +; RV32-ILP32D-NEXT: sw a0, 0(sp) +; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI3_0)(a1) ; RV32-ILP32D-NEXT: li a0, 1 ; RV32-ILP32D-NEXT: li a2, 2 ; RV32-ILP32D-NEXT: li a4, 3 ; RV32-ILP32D-NEXT: li a6, 4 -; RV32-ILP32D-NEXT: sw a1, 0(sp) ; RV32-ILP32D-NEXT: li a1, 0 ; RV32-ILP32D-NEXT: li a3, 0 ; RV32-ILP32D-NEXT: li a5, 0 @@ -147,16 +147,17 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind ; RV32-ILP32D: # %bb.0: ; RV32-ILP32D-NEXT: addi sp, sp, -16 ; RV32-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32D-NEXT: lui a1, 262816 -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_0) +; RV32-ILP32D-NEXT: lui a0, 262816 +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI7_0) ; RV32-ILP32D-NEXT: lui a2, %hi(.LCPI7_1) ; RV32-ILP32D-NEXT: lui a3, %hi(.LCPI7_2) ; RV32-ILP32D-NEXT: lui a4, %hi(.LCPI7_3) ; RV32-ILP32D-NEXT: lui a5, %hi(.LCPI7_4) ; RV32-ILP32D-NEXT: lui a6, %hi(.LCPI7_5) ; RV32-ILP32D-NEXT: lui a7, %hi(.LCPI7_6) -; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI7_0)(a0) +; RV32-ILP32D-NEXT: sw a0, 0(sp) ; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI7_7) +; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI7_0)(a1) ; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI7_1)(a2) ; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI7_2)(a3) ; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI7_3)(a4) @@ -168,7 +169,6 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind ; RV32-ILP32D-NEXT: li a2, 3 ; RV32-ILP32D-NEXT: li a4, 5 ; RV32-ILP32D-NEXT: li a6, 7 -; RV32-ILP32D-NEXT: sw a1, 0(sp) ; RV32-ILP32D-NEXT: li a1, 0 ; RV32-ILP32D-NEXT: li a3, 0 ; RV32-ILP32D-NEXT: li a5, 0 @@ -203,29 +203,29 @@ define i32 @caller_double_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ILP32D: # %bb.0: ; RV32-ILP32D-NEXT: addi sp, sp, -16 ; RV32-ILP32D-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32D-NEXT: lui a1, 262816 -; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_0) +; RV32-ILP32D-NEXT: lui a0, 262816 +; RV32-ILP32D-NEXT: lui a1, %hi(.LCPI9_0) ; RV32-ILP32D-NEXT: lui a2, %hi(.LCPI9_1) ; RV32-ILP32D-NEXT: lui a3, %hi(.LCPI9_2) ; RV32-ILP32D-NEXT: lui a4, %hi(.LCPI9_3) ; RV32-ILP32D-NEXT: lui a5, %hi(.LCPI9_4) ; 
RV32-ILP32D-NEXT: lui a6, %hi(.LCPI9_5) ; RV32-ILP32D-NEXT: lui a7, %hi(.LCPI9_6) -; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI9_0)(a0) -; RV32-ILP32D-NEXT: lui t0, %hi(.LCPI9_7) +; RV32-ILP32D-NEXT: sw zero, 0(sp) +; RV32-ILP32D-NEXT: sw a0, 4(sp) +; RV32-ILP32D-NEXT: lui a0, %hi(.LCPI9_7) +; RV32-ILP32D-NEXT: fld fa0, %lo(.LCPI9_0)(a1) ; RV32-ILP32D-NEXT: fld fa1, %lo(.LCPI9_1)(a2) -; RV32-ILP32D-NEXT: li a0, 1 ; RV32-ILP32D-NEXT: fld fa2, %lo(.LCPI9_2)(a3) ; RV32-ILP32D-NEXT: fld fa3, %lo(.LCPI9_3)(a4) ; RV32-ILP32D-NEXT: fld fa4, %lo(.LCPI9_4)(a5) ; RV32-ILP32D-NEXT: fld fa5, %lo(.LCPI9_5)(a6) ; RV32-ILP32D-NEXT: fld fa6, %lo(.LCPI9_6)(a7) -; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI9_7)(t0) +; RV32-ILP32D-NEXT: fld fa7, %lo(.LCPI9_7)(a0) +; RV32-ILP32D-NEXT: li a0, 1 ; RV32-ILP32D-NEXT: li a2, 3 ; RV32-ILP32D-NEXT: li a4, 5 ; RV32-ILP32D-NEXT: li a6, 7 -; RV32-ILP32D-NEXT: sw zero, 0(sp) -; RV32-ILP32D-NEXT: sw a1, 4(sp) ; RV32-ILP32D-NEXT: li a1, 0 ; RV32-ILP32D-NEXT: li a3, 0 ; RV32-ILP32D-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll index e16bed5400300..51def89ed6c3a 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll @@ -224,10 +224,10 @@ define i32 @caller_float_on_stack() { ; ILP32E-FPELIM-NEXT: li a3, 4 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a2, 2 -; ILP32E-FPELIM-NEXT: li a4, 3 ; ILP32E-FPELIM-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-NEXT: sw zero, 4(sp) ; ILP32E-FPELIM-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-NEXT: li a4, 3 ; ILP32E-FPELIM-NEXT: li a1, 0 ; ILP32E-FPELIM-NEXT: li a3, 0 ; ILP32E-FPELIM-NEXT: li a5, 0 @@ -252,10 +252,10 @@ define i32 @caller_float_on_stack() { ; ILP32E-WITHFP-NEXT: li a3, 4 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a2, 2 -; ILP32E-WITHFP-NEXT: li a4, 3 ; ILP32E-WITHFP-NEXT: sw a3, 0(sp) ; ILP32E-WITHFP-NEXT: sw zero, 4(sp) ; ILP32E-WITHFP-NEXT: sw a1, 8(sp) +; ILP32E-WITHFP-NEXT: li a4, 3 ; ILP32E-WITHFP-NEXT: li a1, 0 ; ILP32E-WITHFP-NEXT: li a3, 0 ; ILP32E-WITHFP-NEXT: li a5, 0 @@ -280,10 +280,10 @@ define i32 @caller_float_on_stack() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 4 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 2 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 3 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 0(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 4(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 8(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 3 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 0 @@ -306,10 +306,10 @@ define i32 @caller_float_on_stack() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 4 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 2 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 3 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 0(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 4(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 8(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 3 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 0 @@ -589,16 +589,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ILP32E-FPELIM-LABEL: callee_aligned_stack: ; ILP32E-FPELIM: # %bb.0: ; ILP32E-FPELIM-NEXT: lw a0, 0(a2) -; ILP32E-FPELIM-NEXT: lw a1, 12(sp) -; ILP32E-FPELIM-NEXT: lw a2, 4(sp) +; ILP32E-FPELIM-NEXT: lw a1, 24(sp) +; 
ILP32E-FPELIM-NEXT: lw a2, 12(sp) ; ILP32E-FPELIM-NEXT: lw a3, 8(sp) -; ILP32E-FPELIM-NEXT: lw a4, 24(sp) +; ILP32E-FPELIM-NEXT: lw a4, 4(sp) ; ILP32E-FPELIM-NEXT: lw a5, 20(sp) +; ILP32E-FPELIM-NEXT: add a0, a0, a4 +; ILP32E-FPELIM-NEXT: add a2, a3, a2 ; ILP32E-FPELIM-NEXT: add a0, a0, a2 -; ILP32E-FPELIM-NEXT: add a1, a3, a1 +; ILP32E-FPELIM-NEXT: add a1, a5, a1 ; ILP32E-FPELIM-NEXT: add a0, a0, a1 -; ILP32E-FPELIM-NEXT: add a4, a5, a4 -; ILP32E-FPELIM-NEXT: add a0, a0, a4 ; ILP32E-FPELIM-NEXT: ret ; ; ILP32E-WITHFP-LABEL: callee_aligned_stack: @@ -612,16 +612,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ILP32E-WITHFP-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: lw a0, 0(a2) -; ILP32E-WITHFP-NEXT: lw a1, 12(s0) -; ILP32E-WITHFP-NEXT: lw a2, 4(s0) +; ILP32E-WITHFP-NEXT: lw a1, 24(s0) +; ILP32E-WITHFP-NEXT: lw a2, 12(s0) ; ILP32E-WITHFP-NEXT: lw a3, 8(s0) -; ILP32E-WITHFP-NEXT: lw a4, 24(s0) +; ILP32E-WITHFP-NEXT: lw a4, 4(s0) ; ILP32E-WITHFP-NEXT: lw a5, 20(s0) +; ILP32E-WITHFP-NEXT: add a0, a0, a4 +; ILP32E-WITHFP-NEXT: add a2, a3, a2 ; ILP32E-WITHFP-NEXT: add a0, a0, a2 -; ILP32E-WITHFP-NEXT: add a1, a3, a1 +; ILP32E-WITHFP-NEXT: add a1, a5, a1 ; ILP32E-WITHFP-NEXT: add a0, a0, a1 -; ILP32E-WITHFP-NEXT: add a4, a5, a4 -; ILP32E-WITHFP-NEXT: add a0, a0, a4 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload ; ILP32E-WITHFP-NEXT: lw s0, 0(sp) # 4-byte Folded Reload @@ -634,16 +634,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_aligned_stack: ; ILP32E-FPELIM-SAVE-RESTORE: # %bb.0: ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 0(a2) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 12(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a2, 4(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 24(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a2, 12(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a3, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 24(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 4(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 20(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a2, a3, a2 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a2 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a1, a3, a1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a1, a5, a1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a4, a5, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: add a0, a0, a4 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: ret ; ; ILP32E-WITHFP-SAVE-RESTORE-LABEL: callee_aligned_stack: @@ -655,16 +655,16 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 % ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 0(a2) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 12(s0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a2, 4(s0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 24(s0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a2, 12(s0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a3, 8(s0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 24(s0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 4(s0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 20(s0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a2, a3, a2 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a2 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a1, a3, a1 +; 
ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a1, a5, a1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a4, a5, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: add a0, a0, a4 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: tail __riscv_restore_1 %1 = bitcast fp128 %c to i128 @@ -694,43 +694,43 @@ define void @caller_aligned_stack() { ; ILP32E-FPELIM-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: li a3, 18 -; ILP32E-FPELIM-NEXT: li a4, 17 -; ILP32E-FPELIM-NEXT: li a5, 16 -; ILP32E-FPELIM-NEXT: lui a6, 262236 -; ILP32E-FPELIM-NEXT: lui a7, 377487 -; ILP32E-FPELIM-NEXT: li t0, 15 -; ILP32E-FPELIM-NEXT: li t1, 14 -; ILP32E-FPELIM-NEXT: li t2, 4 -; ILP32E-FPELIM-NEXT: lui t3, 262153 -; ILP32E-FPELIM-NEXT: lui t4, 545260 -; ILP32E-FPELIM-NEXT: lui t5, 964690 -; ILP32E-FPELIM-NEXT: lui t6, 335544 -; ILP32E-FPELIM-NEXT: lui s2, 688509 +; ILP32E-FPELIM-NEXT: li a2, 18 +; ILP32E-FPELIM-NEXT: li a3, 17 +; ILP32E-FPELIM-NEXT: li a4, 16 +; ILP32E-FPELIM-NEXT: lui a5, 262236 +; ILP32E-FPELIM-NEXT: lui a6, 377487 +; ILP32E-FPELIM-NEXT: li a7, 15 +; ILP32E-FPELIM-NEXT: li t0, 14 +; ILP32E-FPELIM-NEXT: li t1, 4 +; ILP32E-FPELIM-NEXT: lui t2, 262153 +; ILP32E-FPELIM-NEXT: lui t3, 545260 +; ILP32E-FPELIM-NEXT: lui t4, 964690 +; ILP32E-FPELIM-NEXT: lui t5, 335544 +; ILP32E-FPELIM-NEXT: lui t6, 688509 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 11 +; ILP32E-FPELIM-NEXT: addi a5, a5, 655 +; ILP32E-FPELIM-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-NEXT: sw a3, 24(sp) +; ILP32E-FPELIM-NEXT: sw a2, 28(sp) ; ILP32E-FPELIM-NEXT: addi a2, sp, 32 -; ILP32E-FPELIM-NEXT: addi a6, a6, 655 -; ILP32E-FPELIM-NEXT: sw a6, 16(sp) -; ILP32E-FPELIM-NEXT: sw a5, 20(sp) -; ILP32E-FPELIM-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-NEXT: sw a3, 28(sp) +; ILP32E-FPELIM-NEXT: addi a3, a6, 1475 +; ILP32E-FPELIM-NEXT: sw t1, 0(sp) +; ILP32E-FPELIM-NEXT: sw t0, 4(sp) +; ILP32E-FPELIM-NEXT: sw a7, 8(sp) +; ILP32E-FPELIM-NEXT: sw a3, 12(sp) ; ILP32E-FPELIM-NEXT: li a3, 12 -; ILP32E-FPELIM-NEXT: addi a4, a7, 1475 -; ILP32E-FPELIM-NEXT: sw t2, 0(sp) -; ILP32E-FPELIM-NEXT: sw t1, 4(sp) -; ILP32E-FPELIM-NEXT: sw t0, 8(sp) -; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: addi a4, t2, 491 +; ILP32E-FPELIM-NEXT: addi a6, t3, -1967 +; ILP32E-FPELIM-NEXT: addi a7, t4, -328 +; ILP32E-FPELIM-NEXT: addi t0, t5, 1311 +; ILP32E-FPELIM-NEXT: addi a5, t6, -2048 +; ILP32E-FPELIM-NEXT: sw t0, 32(sp) +; ILP32E-FPELIM-NEXT: sw a7, 36(sp) +; ILP32E-FPELIM-NEXT: sw a6, 40(sp) +; ILP32E-FPELIM-NEXT: sw a4, 44(sp) ; ILP32E-FPELIM-NEXT: li a4, 13 -; ILP32E-FPELIM-NEXT: addi a6, t3, 491 -; ILP32E-FPELIM-NEXT: addi a7, t4, -1967 -; ILP32E-FPELIM-NEXT: addi t0, t5, -328 -; ILP32E-FPELIM-NEXT: addi t1, t6, 1311 -; ILP32E-FPELIM-NEXT: addi a5, s2, -2048 -; ILP32E-FPELIM-NEXT: sw t1, 32(sp) -; ILP32E-FPELIM-NEXT: sw t0, 36(sp) -; ILP32E-FPELIM-NEXT: sw a7, 40(sp) -; ILP32E-FPELIM-NEXT: sw a6, 44(sp) ; ILP32E-FPELIM-NEXT: call callee_aligned_stack ; ILP32E-FPELIM-NEXT: addi sp, s0, -64 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa sp, 64 @@ -753,43 +753,43 @@ define void @caller_aligned_stack() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: li a3, 18 -; ILP32E-WITHFP-NEXT: li a4, 17 -; ILP32E-WITHFP-NEXT: li a5, 16 -; ILP32E-WITHFP-NEXT: lui a6, 262236 -; ILP32E-WITHFP-NEXT: lui a7, 377487 -; 
ILP32E-WITHFP-NEXT: li t0, 15 -; ILP32E-WITHFP-NEXT: li t1, 14 -; ILP32E-WITHFP-NEXT: li t2, 4 -; ILP32E-WITHFP-NEXT: lui t3, 262153 -; ILP32E-WITHFP-NEXT: lui t4, 545260 -; ILP32E-WITHFP-NEXT: lui t5, 964690 -; ILP32E-WITHFP-NEXT: lui t6, 335544 -; ILP32E-WITHFP-NEXT: lui s2, 688509 +; ILP32E-WITHFP-NEXT: li a2, 18 +; ILP32E-WITHFP-NEXT: li a3, 17 +; ILP32E-WITHFP-NEXT: li a4, 16 +; ILP32E-WITHFP-NEXT: lui a5, 262236 +; ILP32E-WITHFP-NEXT: lui a6, 377487 +; ILP32E-WITHFP-NEXT: li a7, 15 +; ILP32E-WITHFP-NEXT: li t0, 14 +; ILP32E-WITHFP-NEXT: li t1, 4 +; ILP32E-WITHFP-NEXT: lui t2, 262153 +; ILP32E-WITHFP-NEXT: lui t3, 545260 +; ILP32E-WITHFP-NEXT: lui t4, 964690 +; ILP32E-WITHFP-NEXT: lui t5, 335544 +; ILP32E-WITHFP-NEXT: lui t6, 688509 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 11 +; ILP32E-WITHFP-NEXT: addi a5, a5, 655 +; ILP32E-WITHFP-NEXT: sw a5, 16(sp) +; ILP32E-WITHFP-NEXT: sw a4, 20(sp) +; ILP32E-WITHFP-NEXT: sw a3, 24(sp) +; ILP32E-WITHFP-NEXT: sw a2, 28(sp) ; ILP32E-WITHFP-NEXT: addi a2, sp, 32 -; ILP32E-WITHFP-NEXT: addi a6, a6, 655 -; ILP32E-WITHFP-NEXT: sw a6, 16(sp) -; ILP32E-WITHFP-NEXT: sw a5, 20(sp) -; ILP32E-WITHFP-NEXT: sw a4, 24(sp) -; ILP32E-WITHFP-NEXT: sw a3, 28(sp) +; ILP32E-WITHFP-NEXT: addi a3, a6, 1475 +; ILP32E-WITHFP-NEXT: sw t1, 0(sp) +; ILP32E-WITHFP-NEXT: sw t0, 4(sp) +; ILP32E-WITHFP-NEXT: sw a7, 8(sp) +; ILP32E-WITHFP-NEXT: sw a3, 12(sp) ; ILP32E-WITHFP-NEXT: li a3, 12 -; ILP32E-WITHFP-NEXT: addi a4, a7, 1475 -; ILP32E-WITHFP-NEXT: sw t2, 0(sp) -; ILP32E-WITHFP-NEXT: sw t1, 4(sp) -; ILP32E-WITHFP-NEXT: sw t0, 8(sp) -; ILP32E-WITHFP-NEXT: sw a4, 12(sp) +; ILP32E-WITHFP-NEXT: addi a4, t2, 491 +; ILP32E-WITHFP-NEXT: addi a6, t3, -1967 +; ILP32E-WITHFP-NEXT: addi a7, t4, -328 +; ILP32E-WITHFP-NEXT: addi t0, t5, 1311 +; ILP32E-WITHFP-NEXT: addi a5, t6, -2048 +; ILP32E-WITHFP-NEXT: sw t0, 32(sp) +; ILP32E-WITHFP-NEXT: sw a7, 36(sp) +; ILP32E-WITHFP-NEXT: sw a6, 40(sp) +; ILP32E-WITHFP-NEXT: sw a4, 44(sp) ; ILP32E-WITHFP-NEXT: li a4, 13 -; ILP32E-WITHFP-NEXT: addi a6, t3, 491 -; ILP32E-WITHFP-NEXT: addi a7, t4, -1967 -; ILP32E-WITHFP-NEXT: addi t0, t5, -328 -; ILP32E-WITHFP-NEXT: addi t1, t6, 1311 -; ILP32E-WITHFP-NEXT: addi a5, s2, -2048 -; ILP32E-WITHFP-NEXT: sw t1, 32(sp) -; ILP32E-WITHFP-NEXT: sw t0, 36(sp) -; ILP32E-WITHFP-NEXT: sw a7, 40(sp) -; ILP32E-WITHFP-NEXT: sw a6, 44(sp) ; ILP32E-WITHFP-NEXT: call callee_aligned_stack ; ILP32E-WITHFP-NEXT: addi sp, s0, -64 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 64 @@ -812,43 +812,43 @@ define void @caller_aligned_stack() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 18 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 17 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a6, 262236 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a7, 377487 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t0, 15 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t1, 14 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t2, 4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t3, 262153 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t4, 545260 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t5, 964690 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t6, 335544 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui s2, 688509 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 18 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 17 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 16 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a5, 262236 +; 
ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a6, 377487 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a7, 15 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t0, 14 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t1, 4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t2, 262153 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t3, 545260 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t4, 964690 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t5, 335544 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t6, 688509 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 11 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a5, a5, 655 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 16(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 20(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 24(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a2, 28(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a2, sp, 32 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, a6, 655 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 16(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 20(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 24(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 28(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a3, a6, 1475 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 0(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 4(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 8(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 12(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 12 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a4, a7, 1475 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t2, 0(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 4(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a4, t2, 491 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, t3, -1967 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a7, t4, -328 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t0, t5, 1311 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a5, t6, -2048 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 32(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 36(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 40(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 44(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 13 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, t3, 491 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a7, t4, -1967 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t0, t5, -328 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi t1, t6, 1311 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a5, s2, -2048 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 32(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 36(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 40(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 44(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: call callee_aligned_stack ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi sp, s0, -64 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 64 @@ -867,43 +867,43 @@ define void @caller_aligned_stack() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 18 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 17 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a6, 262236 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a7, 377487 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t0, 15 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t1, 14 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t2, 4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t3, 262153 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t4, 545260 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t5, 964690 -; 
ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t6, 335544 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui s2, 688509 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 18 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 17 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 16 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a5, 262236 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a6, 377487 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a7, 15 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t0, 14 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t1, 4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t2, 262153 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t3, 545260 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t4, 964690 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t5, 335544 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t6, 688509 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 11 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a5, a5, 655 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 16(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 20(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 24(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a2, 28(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a2, sp, 32 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, a6, 655 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 16(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 20(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 24(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 28(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a3, a6, 1475 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 0(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 4(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 8(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 12(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 12 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a4, a7, 1475 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t2, 0(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 4(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 8(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 12(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a4, t2, 491 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, t3, -1967 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a7, t4, -328 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t0, t5, 1311 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a5, t6, -2048 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 32(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 36(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 40(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 44(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 13 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, t3, 491 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a7, t4, -1967 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t0, t5, -328 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi t1, t6, 1311 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a5, s2, -2048 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 32(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 36(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 40(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 44(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: call callee_aligned_stack ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi sp, s0, -64 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 64 @@ -1272,17 +1272,17 @@ define i32 @caller_many_scalars() { ; ILP32E-FPELIM-NEXT: sw ra, 16(sp) # 4-byte Folded Spill ; ILP32E-FPELIM-NEXT: .cfi_offset ra, -4 ; ILP32E-FPELIM-NEXT: li a4, 8 -; ILP32E-FPELIM-NEXT: li a6, 7 -; ILP32E-FPELIM-NEXT: li a7, 6 +; ILP32E-FPELIM-NEXT: li a5, 7 +; ILP32E-FPELIM-NEXT: li a6, 6 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 2 ; ILP32E-FPELIM-NEXT: li a2, 3 ; ILP32E-FPELIM-NEXT: li a3, 4 -; ILP32E-FPELIM-NEXT: li a5, 5 
-; ILP32E-FPELIM-NEXT: sw a7, 0(sp) -; ILP32E-FPELIM-NEXT: sw a6, 4(sp) +; ILP32E-FPELIM-NEXT: sw a6, 0(sp) +; ILP32E-FPELIM-NEXT: sw a5, 4(sp) ; ILP32E-FPELIM-NEXT: sw zero, 8(sp) ; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: li a5, 5 ; ILP32E-FPELIM-NEXT: li a4, 0 ; ILP32E-FPELIM-NEXT: call callee_many_scalars ; ILP32E-FPELIM-NEXT: lw ra, 16(sp) # 4-byte Folded Reload @@ -1302,17 +1302,17 @@ define i32 @caller_many_scalars() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 24 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: li a4, 8 -; ILP32E-WITHFP-NEXT: li a6, 7 -; ILP32E-WITHFP-NEXT: li a7, 6 +; ILP32E-WITHFP-NEXT: li a5, 7 +; ILP32E-WITHFP-NEXT: li a6, 6 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 2 ; ILP32E-WITHFP-NEXT: li a2, 3 ; ILP32E-WITHFP-NEXT: li a3, 4 -; ILP32E-WITHFP-NEXT: li a5, 5 -; ILP32E-WITHFP-NEXT: sw a7, 0(sp) -; ILP32E-WITHFP-NEXT: sw a6, 4(sp) +; ILP32E-WITHFP-NEXT: sw a6, 0(sp) +; ILP32E-WITHFP-NEXT: sw a5, 4(sp) ; ILP32E-WITHFP-NEXT: sw zero, 8(sp) ; ILP32E-WITHFP-NEXT: sw a4, 12(sp) +; ILP32E-WITHFP-NEXT: li a5, 5 ; ILP32E-WITHFP-NEXT: li a4, 0 ; ILP32E-WITHFP-NEXT: call callee_many_scalars ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 24 @@ -1332,17 +1332,17 @@ define i32 @caller_many_scalars() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi sp, sp, -16 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa_offset 20 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 8 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a6, 7 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a7, 6 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 7 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a6, 6 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 2 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 3 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 5 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 0(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 4(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 0(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 4(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 8(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 5 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: call callee_many_scalars ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi sp, sp, 16 @@ -1360,17 +1360,17 @@ define i32 @caller_many_scalars() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 24 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 8 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a6, 7 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a7, 6 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 7 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a6, 6 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 2 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 3 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 5 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 0(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 4(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 0(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 4(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 8(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 12(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 5 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: call callee_many_scalars ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 24 @@ -1390,17 +1390,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-FPELIM-NEXT: lw a3, 4(a1) ; 
ILP32E-FPELIM-NEXT: lw a4, 8(a1) ; ILP32E-FPELIM-NEXT: lw a1, 12(a1) -; ILP32E-FPELIM-NEXT: lw a5, 12(a0) +; ILP32E-FPELIM-NEXT: lw a5, 0(a0) ; ILP32E-FPELIM-NEXT: lw a6, 4(a0) ; ILP32E-FPELIM-NEXT: lw a7, 8(a0) -; ILP32E-FPELIM-NEXT: lw a0, 0(a0) -; ILP32E-FPELIM-NEXT: xor a1, a5, a1 -; ILP32E-FPELIM-NEXT: xor a3, a6, a3 -; ILP32E-FPELIM-NEXT: xor a4, a7, a4 -; ILP32E-FPELIM-NEXT: xor a0, a0, a2 -; ILP32E-FPELIM-NEXT: or a1, a3, a1 -; ILP32E-FPELIM-NEXT: or a0, a0, a4 -; ILP32E-FPELIM-NEXT: or a0, a0, a1 +; ILP32E-FPELIM-NEXT: lw a0, 12(a0) +; ILP32E-FPELIM-NEXT: xor a0, a0, a1 +; ILP32E-FPELIM-NEXT: xor a1, a6, a3 +; ILP32E-FPELIM-NEXT: xor a3, a7, a4 +; ILP32E-FPELIM-NEXT: xor a2, a5, a2 +; ILP32E-FPELIM-NEXT: or a0, a1, a0 +; ILP32E-FPELIM-NEXT: or a2, a2, a3 +; ILP32E-FPELIM-NEXT: or a0, a2, a0 ; ILP32E-FPELIM-NEXT: seqz a0, a0 ; ILP32E-FPELIM-NEXT: ret ; @@ -1418,17 +1418,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-WITHFP-NEXT: lw a3, 4(a1) ; ILP32E-WITHFP-NEXT: lw a4, 8(a1) ; ILP32E-WITHFP-NEXT: lw a1, 12(a1) -; ILP32E-WITHFP-NEXT: lw a5, 12(a0) +; ILP32E-WITHFP-NEXT: lw a5, 0(a0) ; ILP32E-WITHFP-NEXT: lw a6, 4(a0) ; ILP32E-WITHFP-NEXT: lw a7, 8(a0) -; ILP32E-WITHFP-NEXT: lw a0, 0(a0) -; ILP32E-WITHFP-NEXT: xor a1, a5, a1 -; ILP32E-WITHFP-NEXT: xor a3, a6, a3 -; ILP32E-WITHFP-NEXT: xor a4, a7, a4 -; ILP32E-WITHFP-NEXT: xor a0, a0, a2 -; ILP32E-WITHFP-NEXT: or a1, a3, a1 -; ILP32E-WITHFP-NEXT: or a0, a0, a4 -; ILP32E-WITHFP-NEXT: or a0, a0, a1 +; ILP32E-WITHFP-NEXT: lw a0, 12(a0) +; ILP32E-WITHFP-NEXT: xor a0, a0, a1 +; ILP32E-WITHFP-NEXT: xor a1, a6, a3 +; ILP32E-WITHFP-NEXT: xor a3, a7, a4 +; ILP32E-WITHFP-NEXT: xor a2, a5, a2 +; ILP32E-WITHFP-NEXT: or a0, a1, a0 +; ILP32E-WITHFP-NEXT: or a2, a2, a3 +; ILP32E-WITHFP-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-NEXT: seqz a0, a0 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload @@ -1445,17 +1445,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a3, 4(a1) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 8(a1) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 12(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 12(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 0(a0) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a6, 4(a0) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a7, 8(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 0(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a5, a1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a6, a3 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a4, a7, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a0, a2 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a1, a3, a1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a0, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a0, a1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 12(a0) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a0, a1 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a6, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a7, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a2, a5, a2 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a1, a0 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a2, a2, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a2, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: ret ; @@ -1471,17 +1471,17 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a3, 4(a1) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 8(a1) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 12(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 12(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: 
lw a5, 0(a0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a6, 4(a0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a7, 8(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 0(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a5, a1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a6, a3 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a4, a7, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a0, a2 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a1, a3, a1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a0, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a0, a1 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 12(a0) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a0, a1 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a6, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a7, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a2, a5, a2 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a1, a0 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a2, a2, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: tail __riscv_restore_1 @@ -1503,18 +1503,18 @@ define i32 @caller_large_scalars() { ; ILP32E-FPELIM-NEXT: addi s0, sp, 48 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: lui a1, 524272 -; ILP32E-FPELIM-NEXT: li a2, 1 -; ILP32E-FPELIM-NEXT: addi a0, sp, 24 +; ILP32E-FPELIM-NEXT: lui a0, 524272 +; ILP32E-FPELIM-NEXT: li a1, 1 ; ILP32E-FPELIM-NEXT: sw zero, 0(sp) ; ILP32E-FPELIM-NEXT: sw zero, 4(sp) ; ILP32E-FPELIM-NEXT: sw zero, 8(sp) -; ILP32E-FPELIM-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-NEXT: mv a1, sp -; ILP32E-FPELIM-NEXT: sw a2, 24(sp) +; ILP32E-FPELIM-NEXT: sw a0, 12(sp) +; ILP32E-FPELIM-NEXT: addi a0, sp, 24 +; ILP32E-FPELIM-NEXT: sw a1, 24(sp) ; ILP32E-FPELIM-NEXT: sw zero, 28(sp) ; ILP32E-FPELIM-NEXT: sw zero, 32(sp) ; ILP32E-FPELIM-NEXT: sw zero, 36(sp) +; ILP32E-FPELIM-NEXT: mv a1, sp ; ILP32E-FPELIM-NEXT: call callee_large_scalars ; ILP32E-FPELIM-NEXT: addi sp, s0, -48 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa sp, 48 @@ -1537,18 +1537,18 @@ define i32 @caller_large_scalars() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 48 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: lui a1, 524272 -; ILP32E-WITHFP-NEXT: li a2, 1 -; ILP32E-WITHFP-NEXT: addi a0, sp, 24 +; ILP32E-WITHFP-NEXT: lui a0, 524272 +; ILP32E-WITHFP-NEXT: li a1, 1 ; ILP32E-WITHFP-NEXT: sw zero, 0(sp) ; ILP32E-WITHFP-NEXT: sw zero, 4(sp) ; ILP32E-WITHFP-NEXT: sw zero, 8(sp) -; ILP32E-WITHFP-NEXT: sw a1, 12(sp) -; ILP32E-WITHFP-NEXT: mv a1, sp -; ILP32E-WITHFP-NEXT: sw a2, 24(sp) +; ILP32E-WITHFP-NEXT: sw a0, 12(sp) +; ILP32E-WITHFP-NEXT: addi a0, sp, 24 +; ILP32E-WITHFP-NEXT: sw a1, 24(sp) ; ILP32E-WITHFP-NEXT: sw zero, 28(sp) ; ILP32E-WITHFP-NEXT: sw zero, 32(sp) ; ILP32E-WITHFP-NEXT: sw zero, 36(sp) +; ILP32E-WITHFP-NEXT: mv a1, sp ; ILP32E-WITHFP-NEXT: call callee_large_scalars ; ILP32E-WITHFP-NEXT: addi sp, s0, -48 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 48 @@ -1571,18 +1571,18 @@ define i32 @caller_large_scalars() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi s0, sp, 48 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a1, 524272 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 1 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a0, sp, 24 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a0, 524272 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 0(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw 
zero, 4(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 12(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: mv a1, sp -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a2, 24(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a0, 12(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a0, sp, 24 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a1, 24(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 28(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 32(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 36(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: mv a1, sp ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: call callee_large_scalars ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi sp, s0, -48 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 48 @@ -1601,18 +1601,18 @@ define i32 @caller_large_scalars() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 48 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a1, 524272 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 1 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a0, sp, 24 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a0, 524272 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 0(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 4(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 8(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 12(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: mv a1, sp -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a2, 24(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a0, 12(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a0, sp, 24 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a1, 24(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 28(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 32(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 36(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: mv a1, sp ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: call callee_large_scalars ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi sp, s0, -48 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 48 @@ -1636,17 +1636,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-FPELIM-NEXT: lw a3, 4(a0) ; ILP32E-FPELIM-NEXT: lw a4, 8(a0) ; ILP32E-FPELIM-NEXT: lw a0, 12(a0) -; ILP32E-FPELIM-NEXT: lw a5, 12(a1) +; ILP32E-FPELIM-NEXT: lw a5, 0(a1) ; ILP32E-FPELIM-NEXT: lw a6, 4(a1) ; ILP32E-FPELIM-NEXT: lw a7, 8(a1) -; ILP32E-FPELIM-NEXT: lw a1, 0(a1) -; ILP32E-FPELIM-NEXT: xor a0, a5, a0 -; ILP32E-FPELIM-NEXT: xor a3, a6, a3 -; ILP32E-FPELIM-NEXT: xor a4, a7, a4 -; ILP32E-FPELIM-NEXT: xor a1, a1, a2 -; ILP32E-FPELIM-NEXT: or a0, a3, a0 -; ILP32E-FPELIM-NEXT: or a1, a1, a4 +; ILP32E-FPELIM-NEXT: lw a1, 12(a1) +; ILP32E-FPELIM-NEXT: xor a0, a1, a0 +; ILP32E-FPELIM-NEXT: xor a1, a6, a3 +; ILP32E-FPELIM-NEXT: xor a3, a7, a4 +; ILP32E-FPELIM-NEXT: xor a2, a5, a2 ; ILP32E-FPELIM-NEXT: or a0, a1, a0 +; ILP32E-FPELIM-NEXT: or a2, a2, a3 +; ILP32E-FPELIM-NEXT: or a0, a2, a0 ; ILP32E-FPELIM-NEXT: seqz a0, a0 ; ILP32E-FPELIM-NEXT: ret ; @@ -1666,17 +1666,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-WITHFP-NEXT: lw a3, 4(a0) ; ILP32E-WITHFP-NEXT: lw a4, 8(a0) ; ILP32E-WITHFP-NEXT: lw a0, 12(a0) -; ILP32E-WITHFP-NEXT: lw a5, 12(a1) +; ILP32E-WITHFP-NEXT: lw a5, 0(a1) ; ILP32E-WITHFP-NEXT: lw a6, 4(a1) ; ILP32E-WITHFP-NEXT: lw a7, 8(a1) -; ILP32E-WITHFP-NEXT: lw a1, 0(a1) -; ILP32E-WITHFP-NEXT: xor a0, a5, a0 -; ILP32E-WITHFP-NEXT: xor a3, a6, a3 -; ILP32E-WITHFP-NEXT: xor a4, a7, a4 -; ILP32E-WITHFP-NEXT: xor a1, a1, a2 -; ILP32E-WITHFP-NEXT: or a0, a3, 
a0 -; ILP32E-WITHFP-NEXT: or a1, a1, a4 +; ILP32E-WITHFP-NEXT: lw a1, 12(a1) +; ILP32E-WITHFP-NEXT: xor a0, a1, a0 +; ILP32E-WITHFP-NEXT: xor a1, a6, a3 +; ILP32E-WITHFP-NEXT: xor a3, a7, a4 +; ILP32E-WITHFP-NEXT: xor a2, a5, a2 ; ILP32E-WITHFP-NEXT: or a0, a1, a0 +; ILP32E-WITHFP-NEXT: or a2, a2, a3 +; ILP32E-WITHFP-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-NEXT: seqz a0, a0 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-NEXT: lw ra, 4(sp) # 4-byte Folded Reload @@ -1695,17 +1695,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a3, 4(a0) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a4, 8(a0) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a0, 12(a0) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 12(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a5, 0(a1) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a6, 4(a1) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a7, 8(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 0(a1) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a5, a0 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a6, a3 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a4, a7, a4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a1, a2 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a3, a0 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a1, a1, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lw a1, 12(a1) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a0, a1, a0 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a1, a6, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a3, a7, a4 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: xor a2, a5, a2 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a1, a0 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a2, a2, a3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: or a0, a2, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: ret ; @@ -1723,17 +1723,17 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a3, 4(a0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a4, 8(a0) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a0, 12(a0) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 12(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a5, 0(a1) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a6, 4(a1) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a7, 8(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 0(a1) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a5, a0 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a6, a3 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a4, a7, a4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a1, a2 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a3, a0 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a1, a1, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lw a1, 12(a1) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a0, a1, a0 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a1, a6, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a3, a7, a4 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: xor a2, a5, a2 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a1, a0 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a2, a2, a3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: or a0, a2, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: seqz a0, a0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: tail __riscv_restore_1 @@ -1755,30 +1755,30 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-FPELIM-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: addi a4, sp, 16 -; ILP32E-FPELIM-NEXT: li a5, 9 -; ILP32E-FPELIM-NEXT: addi a6, sp, 40 -; ILP32E-FPELIM-NEXT: li a7, 7 -; ILP32E-FPELIM-NEXT: lui t0, 524272 -; ILP32E-FPELIM-NEXT: li t1, 8 +; ILP32E-FPELIM-NEXT: addi 
a3, sp, 16 +; ILP32E-FPELIM-NEXT: li a4, 9 +; ILP32E-FPELIM-NEXT: addi a5, sp, 40 +; ILP32E-FPELIM-NEXT: li a6, 7 +; ILP32E-FPELIM-NEXT: lui a7, 524272 +; ILP32E-FPELIM-NEXT: li t0, 8 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 2 ; ILP32E-FPELIM-NEXT: li a2, 3 +; ILP32E-FPELIM-NEXT: sw a6, 0(sp) +; ILP32E-FPELIM-NEXT: sw a5, 4(sp) +; ILP32E-FPELIM-NEXT: sw a4, 8(sp) +; ILP32E-FPELIM-NEXT: sw a3, 12(sp) ; ILP32E-FPELIM-NEXT: li a3, 4 -; ILP32E-FPELIM-NEXT: sw a7, 0(sp) -; ILP32E-FPELIM-NEXT: sw a6, 4(sp) -; ILP32E-FPELIM-NEXT: sw a5, 8(sp) -; ILP32E-FPELIM-NEXT: sw a4, 12(sp) -; ILP32E-FPELIM-NEXT: li a4, 5 ; ILP32E-FPELIM-NEXT: sw zero, 16(sp) ; ILP32E-FPELIM-NEXT: sw zero, 20(sp) ; ILP32E-FPELIM-NEXT: sw zero, 24(sp) -; ILP32E-FPELIM-NEXT: sw t0, 28(sp) -; ILP32E-FPELIM-NEXT: li a5, 6 -; ILP32E-FPELIM-NEXT: sw t1, 40(sp) +; ILP32E-FPELIM-NEXT: sw a7, 28(sp) +; ILP32E-FPELIM-NEXT: li a4, 5 +; ILP32E-FPELIM-NEXT: sw t0, 40(sp) ; ILP32E-FPELIM-NEXT: sw zero, 44(sp) ; ILP32E-FPELIM-NEXT: sw zero, 48(sp) ; ILP32E-FPELIM-NEXT: sw zero, 52(sp) +; ILP32E-FPELIM-NEXT: li a5, 6 ; ILP32E-FPELIM-NEXT: call callee_large_scalars_exhausted_regs ; ILP32E-FPELIM-NEXT: addi sp, s0, -64 ; ILP32E-FPELIM-NEXT: .cfi_def_cfa sp, 64 @@ -1801,30 +1801,30 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-WITHFP-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: addi a4, sp, 16 -; ILP32E-WITHFP-NEXT: li a5, 9 -; ILP32E-WITHFP-NEXT: addi a6, sp, 40 -; ILP32E-WITHFP-NEXT: li a7, 7 -; ILP32E-WITHFP-NEXT: lui t0, 524272 -; ILP32E-WITHFP-NEXT: li t1, 8 +; ILP32E-WITHFP-NEXT: addi a3, sp, 16 +; ILP32E-WITHFP-NEXT: li a4, 9 +; ILP32E-WITHFP-NEXT: addi a5, sp, 40 +; ILP32E-WITHFP-NEXT: li a6, 7 +; ILP32E-WITHFP-NEXT: lui a7, 524272 +; ILP32E-WITHFP-NEXT: li t0, 8 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 2 ; ILP32E-WITHFP-NEXT: li a2, 3 +; ILP32E-WITHFP-NEXT: sw a6, 0(sp) +; ILP32E-WITHFP-NEXT: sw a5, 4(sp) +; ILP32E-WITHFP-NEXT: sw a4, 8(sp) +; ILP32E-WITHFP-NEXT: sw a3, 12(sp) ; ILP32E-WITHFP-NEXT: li a3, 4 -; ILP32E-WITHFP-NEXT: sw a7, 0(sp) -; ILP32E-WITHFP-NEXT: sw a6, 4(sp) -; ILP32E-WITHFP-NEXT: sw a5, 8(sp) -; ILP32E-WITHFP-NEXT: sw a4, 12(sp) -; ILP32E-WITHFP-NEXT: li a4, 5 ; ILP32E-WITHFP-NEXT: sw zero, 16(sp) ; ILP32E-WITHFP-NEXT: sw zero, 20(sp) ; ILP32E-WITHFP-NEXT: sw zero, 24(sp) -; ILP32E-WITHFP-NEXT: sw t0, 28(sp) -; ILP32E-WITHFP-NEXT: li a5, 6 -; ILP32E-WITHFP-NEXT: sw t1, 40(sp) +; ILP32E-WITHFP-NEXT: sw a7, 28(sp) +; ILP32E-WITHFP-NEXT: li a4, 5 +; ILP32E-WITHFP-NEXT: sw t0, 40(sp) ; ILP32E-WITHFP-NEXT: sw zero, 44(sp) ; ILP32E-WITHFP-NEXT: sw zero, 48(sp) ; ILP32E-WITHFP-NEXT: sw zero, 52(sp) +; ILP32E-WITHFP-NEXT: li a5, 6 ; ILP32E-WITHFP-NEXT: call callee_large_scalars_exhausted_regs ; ILP32E-WITHFP-NEXT: addi sp, s0, -64 ; ILP32E-WITHFP-NEXT: .cfi_def_cfa sp, 64 @@ -1847,30 +1847,30 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a4, sp, 16 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 9 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a6, sp, 40 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a7, 7 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui t0, 524272 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t1, 8 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a3, sp, 16 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 9 +; 
ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi a5, sp, 40 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a6, 7 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: lui a7, 524272 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li t0, 8 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a1, 2 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a2, 3 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 0(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 4(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 8(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a3, 12(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a3, 4 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 0(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a6, 4(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a5, 8(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a4, 12(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 5 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 16(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 20(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 24(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 28(sp) -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 6 -; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t1, 40(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw a7, 28(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a4, 5 +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw t0, 40(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 44(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 48(sp) ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: sw zero, 52(sp) +; ILP32E-FPELIM-SAVE-RESTORE-NEXT: li a5, 6 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: call callee_large_scalars_exhausted_regs ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: addi sp, s0, -64 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 64 @@ -1889,30 +1889,30 @@ define i32 @caller_large_scalars_exhausted_regs() { ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa s0, 0 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a4, sp, 16 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 9 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a6, sp, 40 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a7, 7 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui t0, 524272 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t1, 8 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a3, sp, 16 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 9 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi a5, sp, 40 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a6, 7 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: lui a7, 524272 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li t0, 8 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a0, 1 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a1, 2 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a2, 3 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 0(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 4(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 8(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a3, 12(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a3, 4 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 0(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a6, 4(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a5, 8(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a4, 12(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 5 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 16(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 20(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 24(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 28(sp) -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 6 -; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t1, 40(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw a7, 28(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a4, 5 +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw t0, 40(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 44(sp) ; 
ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 48(sp) ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: sw zero, 52(sp) +; ILP32E-WITHFP-SAVE-RESTORE-NEXT: li a5, 6 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: call callee_large_scalars_exhausted_regs ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: addi sp, s0, -64 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT: .cfi_def_cfa sp, 64 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll index dabd2a7ce9a73..cb98422ebd3ae 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll @@ -59,9 +59,9 @@ define i32 @caller_float_in_fpr_exhausted_gprs() nounwind { ; RV32-ILP32FD-NEXT: li a0, 1 ; RV32-ILP32FD-NEXT: li a2, 2 ; RV32-ILP32FD-NEXT: li a4, 3 +; RV32-ILP32FD-NEXT: sw a1, 0(sp) ; RV32-ILP32FD-NEXT: fmv.w.x fa0, a3 ; RV32-ILP32FD-NEXT: li a6, 4 -; RV32-ILP32FD-NEXT: sw a1, 0(sp) ; RV32-ILP32FD-NEXT: li a1, 0 ; RV32-ILP32FD-NEXT: li a3, 0 ; RV32-ILP32FD-NEXT: li a5, 0 @@ -141,28 +141,28 @@ define i32 @caller_float_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ILP32FD: # %bb.0: ; RV32-ILP32FD-NEXT: addi sp, sp, -16 ; RV32-ILP32FD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32FD-NEXT: lui a1, 267520 -; RV32-ILP32FD-NEXT: lui a0, 262144 +; RV32-ILP32FD-NEXT: lui a0, 267520 +; RV32-ILP32FD-NEXT: lui a1, 262144 ; RV32-ILP32FD-NEXT: lui a2, 264192 ; RV32-ILP32FD-NEXT: lui a3, 265216 -; RV32-ILP32FD-NEXT: lui a4, 266240 -; RV32-ILP32FD-NEXT: lui a5, 266496 -; RV32-ILP32FD-NEXT: lui a6, 266752 -; RV32-ILP32FD-NEXT: lui a7, 267008 -; RV32-ILP32FD-NEXT: fmv.w.x fa0, a0 -; RV32-ILP32FD-NEXT: lui t0, 267264 -; RV32-ILP32FD-NEXT: fmv.w.x fa1, a2 +; RV32-ILP32FD-NEXT: lui a5, 266240 +; RV32-ILP32FD-NEXT: lui a6, 266496 +; RV32-ILP32FD-NEXT: lui a7, 266752 +; RV32-ILP32FD-NEXT: lui t0, 267008 +; RV32-ILP32FD-NEXT: sw a0, 0(sp) +; RV32-ILP32FD-NEXT: lui t1, 267264 +; RV32-ILP32FD-NEXT: fmv.w.x fa0, a1 ; RV32-ILP32FD-NEXT: li a0, 1 -; RV32-ILP32FD-NEXT: fmv.w.x fa2, a3 +; RV32-ILP32FD-NEXT: fmv.w.x fa1, a2 ; RV32-ILP32FD-NEXT: li a2, 3 -; RV32-ILP32FD-NEXT: fmv.w.x fa3, a4 +; RV32-ILP32FD-NEXT: fmv.w.x fa2, a3 ; RV32-ILP32FD-NEXT: li a4, 5 -; RV32-ILP32FD-NEXT: fmv.w.x fa4, a5 -; RV32-ILP32FD-NEXT: fmv.w.x fa5, a6 -; RV32-ILP32FD-NEXT: fmv.w.x fa6, a7 -; RV32-ILP32FD-NEXT: fmv.w.x fa7, t0 +; RV32-ILP32FD-NEXT: fmv.w.x fa3, a5 +; RV32-ILP32FD-NEXT: fmv.w.x fa4, a6 +; RV32-ILP32FD-NEXT: fmv.w.x fa5, a7 +; RV32-ILP32FD-NEXT: fmv.w.x fa6, t0 +; RV32-ILP32FD-NEXT: fmv.w.x fa7, t1 ; RV32-ILP32FD-NEXT: li a6, 7 -; RV32-ILP32FD-NEXT: sw a1, 0(sp) ; RV32-ILP32FD-NEXT: li a1, 0 ; RV32-ILP32FD-NEXT: li a3, 0 ; RV32-ILP32FD-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll index 746b71a08a30b..219fca5e48c52 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll @@ -89,9 +89,9 @@ define i32 @caller_many_scalars() nounwind { ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a5, 5 ; RV64I-NEXT: li a6, 6 -; RV64I-NEXT: li a7, 7 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: li a7, 7 ; RV64I-NEXT: li a4, 0 ; RV64I-NEXT: call callee_many_scalars ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -110,17 +110,17 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind { ; RV64I-NEXT: ld a3, 8(a1) ; RV64I-NEXT: ld a4, 16(a1) ; RV64I-NEXT: ld 
a1, 24(a1) -; RV64I-NEXT: ld a5, 24(a0) +; RV64I-NEXT: ld a5, 0(a0) ; RV64I-NEXT: ld a6, 8(a0) ; RV64I-NEXT: ld a7, 16(a0) -; RV64I-NEXT: ld a0, 0(a0) -; RV64I-NEXT: xor a1, a5, a1 -; RV64I-NEXT: xor a3, a6, a3 -; RV64I-NEXT: xor a4, a7, a4 -; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: xor a1, a6, a3 +; RV64I-NEXT: xor a3, a7, a4 +; RV64I-NEXT: xor a2, a5, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: or a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq i256 %a, %b @@ -133,18 +133,18 @@ define i64 @caller_large_scalars() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -80 ; RV64I-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: li a2, 2 -; RV64I-NEXT: li a3, 1 +; RV64I-NEXT: li a1, 2 +; RV64I-NEXT: li a2, 1 ; RV64I-NEXT: addi a0, sp, 32 -; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: sd a2, 0(sp) +; RV64I-NEXT: sd a1, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: sd a2, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: mv a1, sp ; RV64I-NEXT: call callee_large_scalars ; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 80 @@ -165,17 +165,17 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d, ; RV64I-NEXT: ld a2, 8(a7) ; RV64I-NEXT: ld a3, 16(a7) ; RV64I-NEXT: ld a4, 24(a7) -; RV64I-NEXT: ld a5, 24(a0) +; RV64I-NEXT: ld a5, 0(a0) ; RV64I-NEXT: ld a6, 8(a0) ; RV64I-NEXT: ld a7, 16(a0) -; RV64I-NEXT: ld a0, 0(a0) -; RV64I-NEXT: xor a4, a4, a5 +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: xor a0, a4, a0 ; RV64I-NEXT: xor a2, a2, a6 ; RV64I-NEXT: xor a3, a3, a7 -; RV64I-NEXT: xor a0, a1, a0 -; RV64I-NEXT: or a2, a2, a4 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: xor a1, a1, a5 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a1, a1, a3 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq i256 %h, %j @@ -188,28 +188,28 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -96 ; RV64I-NEXT: sd ra, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a7, sp, 16 -; RV64I-NEXT: li t0, 9 -; RV64I-NEXT: li t1, 10 -; RV64I-NEXT: li t2, 8 +; RV64I-NEXT: addi a6, sp, 16 +; RV64I-NEXT: li a7, 9 +; RV64I-NEXT: li t0, 10 +; RV64I-NEXT: li t1, 8 ; RV64I-NEXT: li a0, 1 ; RV64I-NEXT: li a1, 2 ; RV64I-NEXT: li a2, 3 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 5 ; RV64I-NEXT: li a5, 6 +; RV64I-NEXT: sd a7, 0(sp) +; RV64I-NEXT: sd a6, 8(sp) ; RV64I-NEXT: li a6, 7 -; RV64I-NEXT: sd t0, 0(sp) -; RV64I-NEXT: sd a7, 8(sp) -; RV64I-NEXT: addi a7, sp, 48 -; RV64I-NEXT: sd t1, 16(sp) +; RV64I-NEXT: sd t0, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) -; RV64I-NEXT: sd t2, 48(sp) +; RV64I-NEXT: sd t1, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) ; RV64I-NEXT: sd zero, 64(sp) ; RV64I-NEXT: sd zero, 72(sp) +; RV64I-NEXT: addi a7, sp, 48 ; RV64I-NEXT: call callee_large_scalars_exhausted_regs ; RV64I-NEXT: ld ra, 88(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 96 @@ -329,13 +329,13 @@ define i64 @callee_aligned_stack(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i128 %f ; RV64I-LABEL: callee_aligned_stack: ; RV64I: # %bb.0: ; RV64I-NEXT: ld a0, 32(sp) -; RV64I-NEXT: 
ld a1, 0(sp) -; RV64I-NEXT: ld a2, 16(sp) +; RV64I-NEXT: ld a1, 16(sp) +; RV64I-NEXT: ld a2, 0(sp) ; RV64I-NEXT: ld a3, 40(sp) ; RV64I-NEXT: add a5, a5, a7 -; RV64I-NEXT: add a1, a5, a1 -; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a2, a5, a2 ; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: add a0, a2, a0 ; RV64I-NEXT: add a0, a0, a3 ; RV64I-NEXT: ret %f_trunc = trunc i128 %f to i64 @@ -356,24 +356,24 @@ define void @caller_aligned_stack() nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: li a6, 12 -; RV64I-NEXT: li a7, 11 -; RV64I-NEXT: li t0, 10 -; RV64I-NEXT: li t1, 9 -; RV64I-NEXT: li t2, 8 +; RV64I-NEXT: li a5, 12 +; RV64I-NEXT: li a6, 11 +; RV64I-NEXT: li a7, 10 +; RV64I-NEXT: li t0, 9 +; RV64I-NEXT: li t1, 8 ; RV64I-NEXT: li a0, 1 ; RV64I-NEXT: li a1, 2 ; RV64I-NEXT: li a2, 3 ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: li a4, 5 +; RV64I-NEXT: sd a6, 40(sp) +; RV64I-NEXT: sd a5, 48(sp) ; RV64I-NEXT: li a5, 6 -; RV64I-NEXT: sd a7, 40(sp) -; RV64I-NEXT: sd a6, 48(sp) -; RV64I-NEXT: li a7, 7 -; RV64I-NEXT: sd t2, 0(sp) -; RV64I-NEXT: sd t1, 16(sp) +; RV64I-NEXT: sd t1, 0(sp) +; RV64I-NEXT: sd t0, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: sd t0, 32(sp) +; RV64I-NEXT: sd a7, 32(sp) +; RV64I-NEXT: li a7, 7 ; RV64I-NEXT: li a6, 0 ; RV64I-NEXT: call callee_aligned_stack ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll index c2db8fe5248fd..d43f43ceffec3 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64.ll @@ -112,8 +112,8 @@ define i64 @caller_float_on_stack() nounwind { ; RV64I-FPELIM-NEXT: li a0, 1 ; RV64I-FPELIM-NEXT: li a2, 2 ; RV64I-FPELIM-NEXT: li a4, 3 -; RV64I-FPELIM-NEXT: li a6, 4 ; RV64I-FPELIM-NEXT: sd a1, 0(sp) +; RV64I-FPELIM-NEXT: li a6, 4 ; RV64I-FPELIM-NEXT: li a1, 0 ; RV64I-FPELIM-NEXT: li a3, 0 ; RV64I-FPELIM-NEXT: li a5, 0 @@ -133,8 +133,8 @@ define i64 @caller_float_on_stack() nounwind { ; RV64I-WITHFP-NEXT: li a0, 1 ; RV64I-WITHFP-NEXT: li a2, 2 ; RV64I-WITHFP-NEXT: li a4, 3 -; RV64I-WITHFP-NEXT: li a6, 4 ; RV64I-WITHFP-NEXT: sd a1, 0(sp) +; RV64I-WITHFP-NEXT: li a6, 4 ; RV64I-WITHFP-NEXT: li a1, 0 ; RV64I-WITHFP-NEXT: li a3, 0 ; RV64I-WITHFP-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll index 985135a086e24..cc10e900faa0b 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64e.ll @@ -118,10 +118,10 @@ define i64 @caller_float_on_stack() nounwind { ; RV64I-LP64E-FPELIM-NEXT: li a3, 4 ; RV64I-LP64E-FPELIM-NEXT: li a0, 1 ; RV64I-LP64E-FPELIM-NEXT: li a2, 2 -; RV64I-LP64E-FPELIM-NEXT: li a4, 3 ; RV64I-LP64E-FPELIM-NEXT: sd a3, 0(sp) ; RV64I-LP64E-FPELIM-NEXT: sd zero, 8(sp) ; RV64I-LP64E-FPELIM-NEXT: sd a1, 16(sp) +; RV64I-LP64E-FPELIM-NEXT: li a4, 3 ; RV64I-LP64E-FPELIM-NEXT: li a1, 0 ; RV64I-LP64E-FPELIM-NEXT: li a3, 0 ; RV64I-LP64E-FPELIM-NEXT: li a5, 0 @@ -143,10 +143,10 @@ define i64 @caller_float_on_stack() nounwind { ; RV64I-LP64E-WITHFP-NEXT: li a3, 4 ; RV64I-LP64E-WITHFP-NEXT: li a0, 1 ; RV64I-LP64E-WITHFP-NEXT: li a2, 2 -; RV64I-LP64E-WITHFP-NEXT: li a4, 3 ; RV64I-LP64E-WITHFP-NEXT: sd a3, 0(sp) ; RV64I-LP64E-WITHFP-NEXT: sd zero, 8(sp) ; RV64I-LP64E-WITHFP-NEXT: sd a1, 16(sp) +; RV64I-LP64E-WITHFP-NEXT: li a4, 3 ; RV64I-LP64E-WITHFP-NEXT: li a1, 0 ; RV64I-LP64E-WITHFP-NEXT: li a3, 0 ; 
RV64I-LP64E-WITHFP-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll index eaba1acffa054..284de1988d37e 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32.ll @@ -37,9 +37,9 @@ define float @caller_onstack_f32_noop(float %a) nounwind { ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: li a2, 2 ; RV32IF-NEXT: li a4, 3 -; RV32IF-NEXT: li a6, 4 ; RV32IF-NEXT: sw a3, 0(sp) ; RV32IF-NEXT: sw a1, 4(sp) +; RV32IF-NEXT: li a6, 4 ; RV32IF-NEXT: li a1, 0 ; RV32IF-NEXT: li a3, 0 ; RV32IF-NEXT: li a5, 0 @@ -61,12 +61,12 @@ define float @caller_onstack_f32_fadd(float %a, float %b) nounwind { ; RV32IF-NEXT: fmv.w.x fa4, a0 ; RV32IF-NEXT: fadd.s fa3, fa4, fa5 ; RV32IF-NEXT: fsub.s fa5, fa5, fa4 +; RV32IF-NEXT: fsw fa3, 0(sp) +; RV32IF-NEXT: fsw fa5, 4(sp) ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: li a2, 2 ; RV32IF-NEXT: li a4, 3 ; RV32IF-NEXT: li a6, 4 -; RV32IF-NEXT: fsw fa3, 0(sp) -; RV32IF-NEXT: fsw fa5, 4(sp) ; RV32IF-NEXT: li a1, 0 ; RV32IF-NEXT: li a3, 0 ; RV32IF-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll index 63d4ea5fee331..6bc0e773f0aff 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-rv32f-ilp32e.ll @@ -34,14 +34,14 @@ define float @caller_onstack_f32_noop(float %a) nounwind { ; RV32IF-ILP32E-NEXT: sw ra, 16(sp) # 4-byte Folded Spill ; RV32IF-ILP32E-NEXT: mv a1, a0 ; RV32IF-ILP32E-NEXT: lui a3, 264704 -; RV32IF-ILP32E-NEXT: li a5, 4 +; RV32IF-ILP32E-NEXT: li a4, 4 ; RV32IF-ILP32E-NEXT: li a0, 1 ; RV32IF-ILP32E-NEXT: li a2, 2 -; RV32IF-ILP32E-NEXT: li a4, 3 -; RV32IF-ILP32E-NEXT: sw a5, 0(sp) +; RV32IF-ILP32E-NEXT: sw a4, 0(sp) ; RV32IF-ILP32E-NEXT: sw zero, 4(sp) ; RV32IF-ILP32E-NEXT: sw a3, 8(sp) ; RV32IF-ILP32E-NEXT: sw a1, 12(sp) +; RV32IF-ILP32E-NEXT: li a4, 3 ; RV32IF-ILP32E-NEXT: li a1, 0 ; RV32IF-ILP32E-NEXT: li a3, 0 ; RV32IF-ILP32E-NEXT: li a5, 0 @@ -65,11 +65,11 @@ define float @caller_onstack_f32_fadd(float %a, float %b) nounwind { ; RV32IF-ILP32E-NEXT: li a1, 4 ; RV32IF-ILP32E-NEXT: li a0, 1 ; RV32IF-ILP32E-NEXT: li a2, 2 -; RV32IF-ILP32E-NEXT: li a4, 3 ; RV32IF-ILP32E-NEXT: sw a1, 0(sp) ; RV32IF-ILP32E-NEXT: sw zero, 4(sp) ; RV32IF-ILP32E-NEXT: fsw fa3, 8(sp) ; RV32IF-ILP32E-NEXT: fsw fa5, 12(sp) +; RV32IF-ILP32E-NEXT: li a4, 3 ; RV32IF-ILP32E-NEXT: li a1, 0 ; RV32IF-ILP32E-NEXT: li a3, 0 ; RV32IF-ILP32E-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/calls.ll b/llvm/test/CodeGen/RISCV/calls.ll index cf0e625f3c6c7..6aef8b18f5b77 100644 --- a/llvm/test/CodeGen/RISCV/calls.ll +++ b/llvm/test/CodeGen/RISCV/calls.ll @@ -654,11 +654,11 @@ define i32 @test_call_external_many_args(i32 %a) nounwind { ; RV64I-LARGE-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-LARGE-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-LARGE-NEXT: mv s0, a0 +; RV64I-LARGE-NEXT: sd a0, 0(sp) +; RV64I-LARGE-NEXT: sd a0, 8(sp) ; RV64I-LARGE-NEXT: .Lpcrel_hi4: ; RV64I-LARGE-NEXT: auipc a0, %pcrel_hi(.LCPI8_0) ; RV64I-LARGE-NEXT: ld t1, %pcrel_lo(.Lpcrel_hi4)(a0) -; RV64I-LARGE-NEXT: sd s0, 0(sp) -; RV64I-LARGE-NEXT: sd s0, 8(sp) ; RV64I-LARGE-NEXT: mv a0, s0 ; RV64I-LARGE-NEXT: mv a1, s0 ; RV64I-LARGE-NEXT: mv a2, s0 @@ -681,11 +681,11 @@ define i32 @test_call_external_many_args(i32 %a) nounwind { ; RV64I-LARGE-ZICFILP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-LARGE-ZICFILP-NEXT: sd s0, 16(sp) # 8-byte Folded 
Spill ; RV64I-LARGE-ZICFILP-NEXT: mv s0, a0 +; RV64I-LARGE-ZICFILP-NEXT: sd a0, 0(sp) +; RV64I-LARGE-ZICFILP-NEXT: sd a0, 8(sp) ; RV64I-LARGE-ZICFILP-NEXT: .Lpcrel_hi4: ; RV64I-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI8_0) ; RV64I-LARGE-ZICFILP-NEXT: ld t2, %pcrel_lo(.Lpcrel_hi4)(a0) -; RV64I-LARGE-ZICFILP-NEXT: sd s0, 0(sp) -; RV64I-LARGE-ZICFILP-NEXT: sd s0, 8(sp) ; RV64I-LARGE-ZICFILP-NEXT: mv a0, s0 ; RV64I-LARGE-ZICFILP-NEXT: mv a1, s0 ; RV64I-LARGE-ZICFILP-NEXT: mv a2, s0 @@ -823,11 +823,11 @@ define i32 @test_call_defined_many_args(i32 %a) nounwind { ; RV64I-LARGE: # %bb.0: ; RV64I-LARGE-NEXT: addi sp, sp, -32 ; RV64I-LARGE-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-LARGE-NEXT: sd a0, 0(sp) +; RV64I-LARGE-NEXT: sd a0, 8(sp) ; RV64I-LARGE-NEXT: .Lpcrel_hi5: ; RV64I-LARGE-NEXT: auipc a1, %pcrel_hi(.LCPI10_0) ; RV64I-LARGE-NEXT: ld t1, %pcrel_lo(.Lpcrel_hi5)(a1) -; RV64I-LARGE-NEXT: sd a0, 0(sp) -; RV64I-LARGE-NEXT: sd a0, 8(sp) ; RV64I-LARGE-NEXT: mv a1, a0 ; RV64I-LARGE-NEXT: mv a2, a0 ; RV64I-LARGE-NEXT: mv a3, a0 @@ -845,11 +845,11 @@ define i32 @test_call_defined_many_args(i32 %a) nounwind { ; RV64I-LARGE-ZICFILP-NEXT: lpad 0 ; RV64I-LARGE-ZICFILP-NEXT: addi sp, sp, -32 ; RV64I-LARGE-ZICFILP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-LARGE-ZICFILP-NEXT: sd a0, 0(sp) +; RV64I-LARGE-ZICFILP-NEXT: sd a0, 8(sp) ; RV64I-LARGE-ZICFILP-NEXT: .Lpcrel_hi5: ; RV64I-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI10_0) ; RV64I-LARGE-ZICFILP-NEXT: ld t2, %pcrel_lo(.Lpcrel_hi5)(a1) -; RV64I-LARGE-ZICFILP-NEXT: sd a0, 0(sp) -; RV64I-LARGE-ZICFILP-NEXT: sd a0, 8(sp) ; RV64I-LARGE-ZICFILP-NEXT: mv a1, a0 ; RV64I-LARGE-ZICFILP-NEXT: mv a2, a0 ; RV64I-LARGE-ZICFILP-NEXT: mv a3, a0 diff --git a/llvm/test/CodeGen/RISCV/codemodel-lowering.ll b/llvm/test/CodeGen/RISCV/codemodel-lowering.ll index 4831f0b24c7fe..ab8460d944b33 100644 --- a/llvm/test/CodeGen/RISCV/codemodel-lowering.ll +++ b/llvm/test/CodeGen/RISCV/codemodel-lowering.ll @@ -119,9 +119,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind { ; RV32I-SMALL-NEXT: addi sp, sp, -16 ; RV32I-SMALL-NEXT: lui a1, %hi(.Ltmp0) ; RV32I-SMALL-NEXT: addi a1, a1, %lo(.Ltmp0) -; RV32I-SMALL-NEXT: li a2, 101 ; RV32I-SMALL-NEXT: sw a1, 8(sp) -; RV32I-SMALL-NEXT: blt a0, a2, .LBB2_3 +; RV32I-SMALL-NEXT: li a1, 101 +; RV32I-SMALL-NEXT: blt a0, a1, .LBB2_3 ; RV32I-SMALL-NEXT: # %bb.1: # %if.then ; RV32I-SMALL-NEXT: lw a0, 8(sp) ; RV32I-SMALL-NEXT: jr a0 @@ -141,9 +141,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind { ; RV32I-MEDIUM-NEXT: .Lpcrel_hi2: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(.Ltmp0) ; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi2) -; RV32I-MEDIUM-NEXT: li a2, 101 ; RV32I-MEDIUM-NEXT: sw a1, 8(sp) -; RV32I-MEDIUM-NEXT: blt a0, a2, .LBB2_3 +; RV32I-MEDIUM-NEXT: li a1, 101 +; RV32I-MEDIUM-NEXT: blt a0, a1, .LBB2_3 ; RV32I-MEDIUM-NEXT: # %bb.1: # %if.then ; RV32I-MEDIUM-NEXT: lw a0, 8(sp) ; RV32I-MEDIUM-NEXT: jr a0 @@ -162,9 +162,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind { ; RV64I-SMALL-NEXT: addi sp, sp, -16 ; RV64I-SMALL-NEXT: lui a1, %hi(.Ltmp0) ; RV64I-SMALL-NEXT: addi a1, a1, %lo(.Ltmp0) -; RV64I-SMALL-NEXT: li a2, 101 ; RV64I-SMALL-NEXT: sd a1, 8(sp) -; RV64I-SMALL-NEXT: blt a0, a2, .LBB2_3 +; RV64I-SMALL-NEXT: li a1, 101 +; RV64I-SMALL-NEXT: blt a0, a1, .LBB2_3 ; RV64I-SMALL-NEXT: # %bb.1: # %if.then ; RV64I-SMALL-NEXT: ld a0, 8(sp) ; RV64I-SMALL-NEXT: jr a0 @@ -184,9 +184,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) 
nounwind { ; RV64I-MEDIUM-NEXT: .Lpcrel_hi2: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(.Ltmp0) ; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi2) -; RV64I-MEDIUM-NEXT: li a2, 101 ; RV64I-MEDIUM-NEXT: sd a1, 8(sp) -; RV64I-MEDIUM-NEXT: blt a0, a2, .LBB2_3 +; RV64I-MEDIUM-NEXT: li a1, 101 +; RV64I-MEDIUM-NEXT: blt a0, a1, .LBB2_3 ; RV64I-MEDIUM-NEXT: # %bb.1: # %if.then ; RV64I-MEDIUM-NEXT: ld a0, 8(sp) ; RV64I-MEDIUM-NEXT: jr a0 @@ -206,9 +206,9 @@ define signext i32 @lower_blockaddress_displ(i32 signext %w) nounwind { ; RV64I-LARGE-NEXT: .Lpcrel_hi2: ; RV64I-LARGE-NEXT: auipc a1, %pcrel_hi(.Ltmp0) ; RV64I-LARGE-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi2) -; RV64I-LARGE-NEXT: li a2, 101 ; RV64I-LARGE-NEXT: sd a1, 8(sp) -; RV64I-LARGE-NEXT: blt a0, a2, .LBB2_3 +; RV64I-LARGE-NEXT: li a1, 101 +; RV64I-LARGE-NEXT: blt a0, a1, .LBB2_3 ; RV64I-LARGE-NEXT: # %bb.1: # %if.then ; RV64I-LARGE-NEXT: ld a0, 8(sp) ; RV64I-LARGE-NEXT: jr a0 diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll index dc81c13bfb6a3..e898661665e99 100644 --- a/llvm/test/CodeGen/RISCV/condbinops.ll +++ b/llvm/test/CodeGen/RISCV/condbinops.ll @@ -411,8 +411,8 @@ define i64 @shl64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a4, a4, a2 -; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: sll a2, a0, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB8_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a1, a2 @@ -486,8 +486,8 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: sra a0, a1, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srai a1, a1, 31 @@ -496,10 +496,9 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: srl a3, a3, a2 ; RV32I-NEXT: not a2, a2 ; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: sll a1, a1, a2 -; RV32I-NEXT: or a3, a3, a1 +; RV32I-NEXT: sll a2, a1, a2 ; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: or a0, a3, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ashr64: @@ -562,8 +561,8 @@ define i64 @lshr64(i64 %x, i64 %y, i1 %c) { ; RV32I-NEXT: slli a4, a4, 31 ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a4, a4, a2 -; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: srl a2, a1, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB10_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll index 6c2ba493ffcd5..bd9e543e955d5 100644 --- a/llvm/test/CodeGen/RISCV/condops.ll +++ b/llvm/test/CodeGen/RISCV/condops.ll @@ -1348,13 +1348,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: beqz a1, .LBB23_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: beqz a0, .LBB23_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a6 +; RV32I-NEXT: mv a4, a6 ; RV32I-NEXT: mv a5, a7 ; RV32I-NEXT: .LBB23_2: +; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: ret ; @@ -1425,13 +1425,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: bnez a1, .LBB24_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB24_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, 
a6 +; RV32I-NEXT: mv a4, a6 ; RV32I-NEXT: mv a5, a7 ; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: mv a1, a5 ; RV32I-NEXT: ret ; @@ -2196,13 +2196,13 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) { define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: seteq_zero: ; RV32I: # %bb.0: -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a1, .LBB33_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: beqz a0, .LBB33_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB33_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; @@ -2264,13 +2264,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) { define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: setne_zero: ; RV32I: # %bb.0: -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a1, .LBB34_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB34_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB34_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; @@ -2333,13 +2333,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: seteq_constant: ; RV32I: # %bb.0: ; RV32I-NEXT: xori a0, a0, 123 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a1, .LBB35_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: beqz a0, .LBB35_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB35_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; @@ -2408,13 +2408,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: setne_constant: ; RV32I: # %bb.0: ; RV32I-NEXT: xori a0, a0, 456 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a1, .LBB36_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB36_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB36_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; @@ -2483,13 +2483,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I-LABEL: seteq_2048: ; RV32I: # %bb.0: ; RV32I-NEXT: binvi a0, a0, 11 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a1, .LBB37_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: beqz a0, .LBB37_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB37_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; @@ -2559,13 +2559,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a1 ; RV32I-NEXT: xori a0, a0, -2048 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: beqz a1, .LBB38_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: beqz a0, .LBB38_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB38_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; @@ -2637,13 +2637,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) { ; RV32I: # %bb.0: ; RV32I-NEXT: not a1, a1 ; RV32I-NEXT: xori a0, a0, -2048 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a1, .LBB39_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB39_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a2, a4 ; 
RV32I-NEXT: mv a3, a5 ; RV32I-NEXT: .LBB39_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll index 53de36f1699a9..5400ec6d005ef 100644 --- a/llvm/test/CodeGen/RISCV/copysign-casts.ll +++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll @@ -702,17 +702,17 @@ define half @fold_demote_h_d(half %a, double %b) nounwind { ; RV32IFD-LABEL: fold_demote_h_d: ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: fmv.x.w a0, fa0 ; RV32IFD-NEXT: fsd fa1, 8(sp) -; RV32IFD-NEXT: lw a0, 12(sp) -; RV32IFD-NEXT: fmv.x.w a1, fa0 -; RV32IFD-NEXT: lui a2, 524288 -; RV32IFD-NEXT: and a0, a0, a2 +; RV32IFD-NEXT: lui a1, 524288 +; RV32IFD-NEXT: lw a2, 12(sp) +; RV32IFD-NEXT: and a1, a2, a1 ; RV32IFD-NEXT: lui a2, 1048560 -; RV32IFD-NEXT: slli a1, a1, 17 -; RV32IFD-NEXT: srli a1, a1, 17 -; RV32IFD-NEXT: srli a0, a0, 16 -; RV32IFD-NEXT: or a1, a1, a2 -; RV32IFD-NEXT: or a0, a1, a0 +; RV32IFD-NEXT: slli a0, a0, 17 +; RV32IFD-NEXT: srli a0, a0, 17 +; RV32IFD-NEXT: srli a1, a1, 16 +; RV32IFD-NEXT: or a0, a0, a2 +; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: fmv.w.x fa0, a0 ; RV32IFD-NEXT: addi sp, sp, 16 ; RV32IFD-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index da97ac0d74237..a098de49f8410 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -475,10 +475,10 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV64M: # %bb.0: ; RV64M-NEXT: beqz a0, .LBB3_2 ; RV64M-NEXT: # %bb.1: # %cond.false +; RV64M-NEXT: neg a1, a0 +; RV64M-NEXT: and a0, a0, a1 ; RV64M-NEXT: lui a1, %hi(.LCPI3_0) ; RV64M-NEXT: ld a1, %lo(.LCPI3_0)(a1) -; RV64M-NEXT: neg a2, a0 -; RV64M-NEXT: and a0, a0, a2 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 58 ; RV64M-NEXT: lui a1, %hi(.LCPI3_1) @@ -889,10 +889,10 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; ; RV64M-LABEL: test_cttz_i64_zero_undef: ; RV64M: # %bb.0: +; RV64M-NEXT: neg a1, a0 +; RV64M-NEXT: and a0, a0, a1 ; RV64M-NEXT: lui a1, %hi(.LCPI7_0) ; RV64M-NEXT: ld a1, %lo(.LCPI7_0)(a1) -; RV64M-NEXT: neg a2, a0 -; RV64M-NEXT: and a0, a0, a2 ; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: srli a0, a0, 58 ; RV64M-NEXT: lui a1, %hi(.LCPI7_1) diff --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll index 798eac64e9fc2..51f75c10462d0 100644 --- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll @@ -93,8 +93,8 @@ define double @callee_double_split_reg_stack(i32 %a, i64 %b, i64 %c, double %d, ; RV32IZFINXZDINX-LABEL: callee_double_split_reg_stack: ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: mv a0, a7 -; RV32IZFINXZDINX-NEXT: lw a1, 0(sp) ; RV32IZFINXZDINX-NEXT: mv a3, a6 +; RV32IZFINXZDINX-NEXT: lw a1, 0(sp) ; RV32IZFINXZDINX-NEXT: mv a2, a5 ; RV32IZFINXZDINX-NEXT: fadd.d a0, a2, a0 ; RV32IZFINXZDINX-NEXT: ret @@ -115,8 +115,8 @@ define double @caller_double_split_reg_stack() nounwind { ; RV32IFD-NEXT: addi a2, a2, 327 ; RV32IFD-NEXT: addi a6, a3, 327 ; RV32IFD-NEXT: addi a5, a4, -1311 -; RV32IFD-NEXT: li a3, 3 ; RV32IFD-NEXT: sw a2, 0(sp) +; RV32IFD-NEXT: li a3, 3 ; RV32IFD-NEXT: li a2, 0 ; RV32IFD-NEXT: li a4, 0 ; RV32IFD-NEXT: mv a7, a5 @@ -137,8 +137,8 @@ define double @caller_double_split_reg_stack() nounwind { ; RV32IZFINXZDINX-NEXT: addi a2, a2, 327 ; RV32IZFINXZDINX-NEXT: addi a6, a3, 327 ; RV32IZFINXZDINX-NEXT: addi 
a5, a4, -1311 -; RV32IZFINXZDINX-NEXT: li a3, 3 ; RV32IZFINXZDINX-NEXT: sw a2, 0(sp) +; RV32IZFINXZDINX-NEXT: li a3, 3 ; RV32IZFINXZDINX-NEXT: li a2, 0 ; RV32IZFINXZDINX-NEXT: li a4, 0 ; RV32IZFINXZDINX-NEXT: mv a7, a5 @@ -186,7 +186,6 @@ define double @caller_double_stack() nounwind { ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: li a2, 2 ; RV32IFD-NEXT: li a4, 3 -; RV32IFD-NEXT: li a6, 4 ; RV32IFD-NEXT: addi a1, a1, 327 ; RV32IFD-NEXT: addi a3, a3, -1311 ; RV32IFD-NEXT: addi a5, a5, 327 @@ -194,6 +193,7 @@ define double @caller_double_stack() nounwind { ; RV32IFD-NEXT: sw a1, 4(sp) ; RV32IFD-NEXT: sw a3, 8(sp) ; RV32IFD-NEXT: sw a5, 12(sp) +; RV32IFD-NEXT: li a6, 4 ; RV32IFD-NEXT: li a1, 0 ; RV32IFD-NEXT: li a3, 0 ; RV32IFD-NEXT: li a5, 0 @@ -213,7 +213,6 @@ define double @caller_double_stack() nounwind { ; RV32IZFINXZDINX-NEXT: li a0, 1 ; RV32IZFINXZDINX-NEXT: li a2, 2 ; RV32IZFINXZDINX-NEXT: li a4, 3 -; RV32IZFINXZDINX-NEXT: li a6, 4 ; RV32IZFINXZDINX-NEXT: addi a1, a1, 327 ; RV32IZFINXZDINX-NEXT: addi a3, a3, -1311 ; RV32IZFINXZDINX-NEXT: addi a5, a5, 327 @@ -221,6 +220,7 @@ define double @caller_double_stack() nounwind { ; RV32IZFINXZDINX-NEXT: sw a1, 4(sp) ; RV32IZFINXZDINX-NEXT: sw a3, 8(sp) ; RV32IZFINXZDINX-NEXT: sw a5, 12(sp) +; RV32IZFINXZDINX-NEXT: li a6, 4 ; RV32IZFINXZDINX-NEXT: li a1, 0 ; RV32IZFINXZDINX-NEXT: li a3, 0 ; RV32IZFINXZDINX-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index c39085a80ddc1..052cfd6adff06 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -687,9 +687,9 @@ define i64 @fcvt_l_d_sat(double %a) nounwind { ; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI12_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -1624,13 +1624,13 @@ define signext i16 @fcvt_w_s_i16(double %a) nounwind { define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; RV32IFD-LABEL: fcvt_w_s_sat_i16: ; RV32IFD: # %bb.0: # %start -; RV32IFD-NEXT: lui a0, %hi(.LCPI26_0) -; RV32IFD-NEXT: fld fa5, %lo(.LCPI26_0)(a0) -; RV32IFD-NEXT: lui a0, %hi(.LCPI26_1) -; RV32IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a0) ; RV32IFD-NEXT: feq.d a0, fa0, fa0 -; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: lui a1, %hi(.LCPI26_0) +; RV32IFD-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32IFD-NEXT: lui a1, %hi(.LCPI26_1) ; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a1) ; RV32IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV32IFD-NEXT: fcvt.w.d a1, fa5, rtz ; RV32IFD-NEXT: and a0, a0, a1 @@ -1638,13 +1638,13 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; ; RV64IFD-LABEL: fcvt_w_s_sat_i16: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: lui a0, %hi(.LCPI26_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI26_0)(a0) -; RV64IFD-NEXT: lui a0, %hi(.LCPI26_1) -; RV64IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a0) ; RV64IFD-NEXT: feq.d a0, fa0, fa0 -; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: lui a1, %hi(.LCPI26_0) +; RV64IFD-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV64IFD-NEXT: lui a1, %hi(.LCPI26_1) ; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a1) ; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: 
fcvt.l.d a1, fa5, rtz ; RV64IFD-NEXT: and a0, a0, a1 @@ -1653,31 +1653,31 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; RV32IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV32IZFINXZDINX: # %bb.0: # %start ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI26_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI26_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI26_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI26_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI26_1)(a4) -; RV32IZFINXZDINX-NEXT: fmax.d a2, a0, a2 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI26_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI26_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI26_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fmax.d a4, a0, a4 ; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 ; RV32IZFINXZDINX-NEXT: neg a0, a0 -; RV32IZFINXZDINX-NEXT: fmin.d a2, a2, a4 +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI26_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI26_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fmin.d a2, a4, a2 ; RV32IZFINXZDINX-NEXT: fcvt.w.d a1, a2, rtz ; RV32IZFINXZDINX-NEXT: and a0, a0, a1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: li a1, -505 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: li a2, -505 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 53 +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a2 ; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) -; RV64IZFINXZDINX-NEXT: slli a1, a1, 53 +; RV64IZFINXZDINX-NEXT: neg a1, a1 ; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_0)(a2) -; RV64IZFINXZDINX-NEXT: fmax.d a1, a0, a1 -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a0, a0 -; RV64IZFINXZDINX-NEXT: fmin.d a1, a1, a2 -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a1, rtz -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i16: @@ -1829,40 +1829,40 @@ define zeroext i16 @fcvt_wu_s_i16(double %a) nounwind { define zeroext i16 @fcvt_wu_s_sat_i16(double %a) nounwind { ; RV32IFD-LABEL: fcvt_wu_s_sat_i16: ; RV32IFD: # %bb.0: # %start +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: lui a0, %hi(.LCPI28_0) -; RV32IFD-NEXT: fld fa5, %lo(.LCPI28_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fmax.d fa4, fa0, fa4 -; RV32IFD-NEXT: fmin.d fa5, fa4, fa5 +; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: fld fa4, %lo(.LCPI28_0)(a0) +; RV32IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV32IFD-NEXT: fcvt.wu.d a0, fa5, rtz ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fcvt_wu_s_sat_i16: ; RV64IFD: # %bb.0: # %start +; RV64IFD-NEXT: fmv.d.x fa5, zero ; RV64IFD-NEXT: lui a0, %hi(.LCPI28_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI28_0)(a0) -; RV64IFD-NEXT: fmv.d.x fa4, zero -; RV64IFD-NEXT: fmax.d fa4, fa0, fa4 -; RV64IFD-NEXT: fmin.d fa5, fa4, fa5 +; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: fld fa4, %lo(.LCPI28_0)(a0) +; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.lu.d a0, fa5, rtz ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcvt_wu_s_sat_i16: ; RV32IZFINXZDINX: # %bb.0: # %start +; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero +; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a2 ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI28_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI28_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI28_0)(a2) -; RV32IZFINXZDINX-NEXT: fcvt.d.w a4, zero -; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a4 ; RV32IZFINXZDINX-NEXT: 
fmin.d a0, a0, a2 ; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, zero ; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI28_0) ; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI28_0)(a1) -; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, zero ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a1 ; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rtz ; RV64IZFINXZDINX-NEXT: ret @@ -1999,13 +1999,13 @@ define signext i8 @fcvt_w_s_i8(double %a) nounwind { define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; RV32IFD-LABEL: fcvt_w_s_sat_i8: ; RV32IFD: # %bb.0: # %start -; RV32IFD-NEXT: lui a0, %hi(.LCPI30_0) -; RV32IFD-NEXT: fld fa5, %lo(.LCPI30_0)(a0) -; RV32IFD-NEXT: lui a0, %hi(.LCPI30_1) -; RV32IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a0) ; RV32IFD-NEXT: feq.d a0, fa0, fa0 -; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: lui a1, %hi(.LCPI30_0) +; RV32IFD-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; RV32IFD-NEXT: lui a1, %hi(.LCPI30_1) ; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a1) ; RV32IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV32IFD-NEXT: fcvt.w.d a1, fa5, rtz ; RV32IFD-NEXT: and a0, a0, a1 @@ -2013,13 +2013,13 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; ; RV64IFD-LABEL: fcvt_w_s_sat_i8: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: lui a0, %hi(.LCPI30_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI30_0)(a0) -; RV64IFD-NEXT: lui a0, %hi(.LCPI30_1) -; RV64IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a0) ; RV64IFD-NEXT: feq.d a0, fa0, fa0 -; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: lui a1, %hi(.LCPI30_0) +; RV64IFD-NEXT: fld fa5, %lo(.LCPI30_0)(a1) +; RV64IFD-NEXT: lui a1, %hi(.LCPI30_1) ; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a1) ; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.l.d a1, fa5, rtz ; RV64IFD-NEXT: and a0, a0, a1 @@ -2028,15 +2028,15 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; RV32IZFINXZDINX-LABEL: fcvt_w_s_sat_i8: ; RV32IZFINXZDINX: # %bb.0: # %start ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI30_0) -; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI30_0+4)(a2) -; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI30_0)(a2) -; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI30_1) -; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI30_1+4)(a4) -; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI30_1)(a4) -; RV32IZFINXZDINX-NEXT: fmax.d a2, a0, a2 +; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI30_1) +; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI30_0)(a2) +; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI30_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fmax.d a4, a0, a4 ; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 ; RV32IZFINXZDINX-NEXT: neg a0, a0 -; RV32IZFINXZDINX-NEXT: fmin.d a2, a2, a4 +; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI30_1)(a3) +; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI30_1+4)(a3) +; RV32IZFINXZDINX-NEXT: fmin.d a2, a4, a2 ; RV32IZFINXZDINX-NEXT: fcvt.w.d a1, a2, rtz ; RV32IZFINXZDINX-NEXT: and a0, a0, a1 ; RV32IZFINXZDINX-NEXT: ret @@ -2203,31 +2203,31 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind { ; ; RV32IFD-LABEL: fcvt_wu_s_sat_i8: ; RV32IFD: # %bb.0: # %start +; RV32IFD-NEXT: fcvt.d.w fa5, zero ; RV32IFD-NEXT: lui a0, %hi(.LCPI32_0) -; RV32IFD-NEXT: fld fa5, %lo(.LCPI32_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fmax.d fa4, fa0, fa4 -; RV32IFD-NEXT: fmin.d fa5, fa4, fa5 +; RV32IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV32IFD-NEXT: fld fa4, %lo(.LCPI32_0)(a0) +; RV32IFD-NEXT: fmin.d 
fa5, fa5, fa4 ; RV32IFD-NEXT: fcvt.wu.d a0, fa5, rtz ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fcvt_wu_s_sat_i8: ; RV64IFD: # %bb.0: # %start +; RV64IFD-NEXT: fmv.d.x fa5, zero ; RV64IFD-NEXT: lui a0, %hi(.LCPI32_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI32_0)(a0) -; RV64IFD-NEXT: fmv.d.x fa4, zero -; RV64IFD-NEXT: fmax.d fa4, fa0, fa4 -; RV64IFD-NEXT: fmin.d fa5, fa4, fa5 +; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: fld fa4, %lo(.LCPI32_0)(a0) +; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.lu.d a0, fa5, rtz ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcvt_wu_s_sat_i8: ; RV32IZFINXZDINX: # %bb.0: # %start +; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero +; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a2 ; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI32_0) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI32_0+4)(a2) ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI32_0)(a2) -; RV32IZFINXZDINX-NEXT: fcvt.d.w a4, zero -; RV32IZFINXZDINX-NEXT: fmax.d a0, a0, a4 ; RV32IZFINXZDINX-NEXT: fmin.d a0, a0, a2 ; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz ; RV32IZFINXZDINX-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll index 949668f640dbd..30f995207851f 100644 --- a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll +++ b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll @@ -275,8 +275,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp { ; CHECKIFD-NEXT: frflags a0 ; CHECKIFD-NEXT: flt.d a2, fa1, fa0 ; CHECKIFD-NEXT: fsflags a0 -; CHECKIFD-NEXT: or a0, a2, a1 ; CHECKIFD-NEXT: feq.d zero, fa1, fa0 +; CHECKIFD-NEXT: or a0, a2, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcmp_one: @@ -288,9 +288,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: frflags a4 ; RV32IZFINXZDINX-NEXT: flt.d a6, a2, a0 ; RV32IZFINXZDINX-NEXT: fsflags a4 -; RV32IZFINXZDINX-NEXT: or a4, a6, a5 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: or a0, a6, a5 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_one: @@ -302,9 +301,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: frflags a2 ; RV64IZFINXZDINX-NEXT: flt.d a4, a1, a0 ; RV64IZFINXZDINX-NEXT: fsflags a2 -; RV64IZFINXZDINX-NEXT: or a2, a4, a3 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: or a0, a4, a3 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_one: @@ -423,9 +421,9 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { ; CHECKIFD-NEXT: frflags a0 ; CHECKIFD-NEXT: flt.d a2, fa1, fa0 ; CHECKIFD-NEXT: fsflags a0 +; CHECKIFD-NEXT: feq.d zero, fa1, fa0 ; CHECKIFD-NEXT: or a1, a2, a1 ; CHECKIFD-NEXT: xori a0, a1, 1 -; CHECKIFD-NEXT: feq.d zero, fa1, fa0 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcmp_ueq: @@ -437,10 +435,9 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: frflags a4 ; RV32IZFINXZDINX-NEXT: flt.d a6, a2, a0 ; RV32IZFINXZDINX-NEXT: fsflags a4 -; RV32IZFINXZDINX-NEXT: or a4, a6, a5 -; RV32IZFINXZDINX-NEXT: xori a4, a4, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: or a0, a6, a5 +; RV32IZFINXZDINX-NEXT: xori a0, a0, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ueq: @@ -452,10 +449,9 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: frflags a2 ; RV64IZFINXZDINX-NEXT: flt.d a4, a1, a0 ; RV64IZFINXZDINX-NEXT: fsflags a2 -; 
RV64IZFINXZDINX-NEXT: or a3, a4, a3 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: or a3, a4, a3 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ueq: @@ -522,8 +518,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { ; CHECKIFD-NEXT: frflags a0 ; CHECKIFD-NEXT: fle.d a1, fa0, fa1 ; CHECKIFD-NEXT: fsflags a0 -; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: feq.d zero, fa0, fa1 +; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcmp_ugt: @@ -531,9 +527,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: frflags a4 ; RV32IZFINXZDINX-NEXT: fle.d a5, a0, a2 ; RV32IZFINXZDINX-NEXT: fsflags a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a0, a2 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ugt: @@ -541,9 +536,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: frflags a2 ; RV64IZFINXZDINX-NEXT: fle.d a3, a0, a1 ; RV64IZFINXZDINX-NEXT: fsflags a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a0, a1 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ugt: @@ -576,8 +570,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { ; CHECKIFD-NEXT: frflags a0 ; CHECKIFD-NEXT: flt.d a1, fa0, fa1 ; CHECKIFD-NEXT: fsflags a0 -; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: feq.d zero, fa0, fa1 +; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcmp_uge: @@ -585,9 +579,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: frflags a4 ; RV32IZFINXZDINX-NEXT: flt.d a5, a0, a2 ; RV32IZFINXZDINX-NEXT: fsflags a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a0, a2 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_uge: @@ -595,9 +588,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: frflags a2 ; RV64IZFINXZDINX-NEXT: flt.d a3, a0, a1 ; RV64IZFINXZDINX-NEXT: fsflags a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a0, a1 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_uge: @@ -632,8 +624,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { ; CHECKIFD-NEXT: frflags a0 ; CHECKIFD-NEXT: fle.d a1, fa1, fa0 ; CHECKIFD-NEXT: fsflags a0 -; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: feq.d zero, fa1, fa0 +; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcmp_ult: @@ -641,9 +633,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: frflags a4 ; RV32IZFINXZDINX-NEXT: fle.d a5, a2, a0 ; RV32IZFINXZDINX-NEXT: fsflags a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ult: @@ -651,9 +642,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: frflags a2 ; RV64IZFINXZDINX-NEXT: fle.d a3, a1, a0 ; RV64IZFINXZDINX-NEXT: fsflags 
a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ult: @@ -686,8 +676,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { ; CHECKIFD-NEXT: frflags a0 ; CHECKIFD-NEXT: flt.d a1, fa1, fa0 ; CHECKIFD-NEXT: fsflags a0 -; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: feq.d zero, fa1, fa0 +; CHECKIFD-NEXT: xori a0, a1, 1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcmp_ule: @@ -695,9 +685,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: frflags a4 ; RV32IZFINXZDINX-NEXT: flt.d a5, a2, a0 ; RV32IZFINXZDINX-NEXT: fsflags a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ule: @@ -705,9 +694,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: frflags a2 ; RV64IZFINXZDINX-NEXT: flt.d a3, a1, a0 ; RV64IZFINXZDINX-NEXT: fsflags a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ule: diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll index 155827ad069cc..97fc1bde6155f 100644 --- a/llvm/test/CodeGen/RISCV/double-imm.ll +++ b/llvm/test/CodeGen/RISCV/double-imm.ll @@ -158,12 +158,12 @@ define dso_local double @negzero_sel(i16 noundef %a, double noundef %d) nounwind ; ; CHECKRV64ZDINX-LABEL: negzero_sel: ; CHECKRV64ZDINX: # %bb.0: # %entry -; CHECKRV64ZDINX-NEXT: slli a2, a0, 48 -; CHECKRV64ZDINX-NEXT: mv a0, a1 -; CHECKRV64ZDINX-NEXT: beqz a2, .LBB4_2 +; CHECKRV64ZDINX-NEXT: slli a0, a0, 48 +; CHECKRV64ZDINX-NEXT: beqz a0, .LBB4_2 ; CHECKRV64ZDINX-NEXT: # %bb.1: # %entry -; CHECKRV64ZDINX-NEXT: fneg.d a0, zero +; CHECKRV64ZDINX-NEXT: fneg.d a1, zero ; CHECKRV64ZDINX-NEXT: .LBB4_2: # %entry +; CHECKRV64ZDINX-NEXT: mv a0, a1 ; CHECKRV64ZDINX-NEXT: ret entry: %tobool.not = icmp eq i16 %a, 0 diff --git a/llvm/test/CodeGen/RISCV/double-mem.ll b/llvm/test/CodeGen/RISCV/double-mem.ll index dba9489e7511d..134c8cb0689ca 100644 --- a/llvm/test/CodeGen/RISCV/double-mem.ll +++ b/llvm/test/CodeGen/RISCV/double-mem.ll @@ -51,10 +51,10 @@ define dso_local void @fsd(ptr %a, double %b, double %c) nounwind { ; RV32IZFINXZDINX-LABEL: fsd: ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: mv a5, a4 -; RV32IZFINXZDINX-NEXT: mv a7, a2 ; RV32IZFINXZDINX-NEXT: mv a4, a3 -; RV32IZFINXZDINX-NEXT: mv a6, a1 -; RV32IZFINXZDINX-NEXT: fadd.d a2, a6, a4 +; RV32IZFINXZDINX-NEXT: mv a3, a2 +; RV32IZFINXZDINX-NEXT: mv a2, a1 +; RV32IZFINXZDINX-NEXT: fadd.d a2, a2, a4 ; RV32IZFINXZDINX-NEXT: sw a2, 0(a0) ; RV32IZFINXZDINX-NEXT: sw a3, 4(a0) ; RV32IZFINXZDINX-NEXT: sw a2, 64(a0) diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll index cd87f2d2301d7..8ebeeabec4a09 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll @@ -48,9 +48,9 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32IFD-NEXT: call floor +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI1_0) ; 
RV32IFD-NEXT: fld fa5, %lo(.LCPI1_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -103,9 +103,9 @@ define i64 @test_floor_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI1_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI1_0)(a2) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI1_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI1_1)(a3) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI1_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 @@ -185,12 +185,12 @@ define i64 @test_floor_ui64(double %x) nounwind { ; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: call floor ; RV32IFD-NEXT: lui a0, %hi(.LCPI3_0) +; RV32IFD-NEXT: fcvt.d.w fa5, zero +; RV32IFD-NEXT: fle.d a1, fa5, fa0 ; RV32IFD-NEXT: fld fa5, %lo(.LCPI3_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fle.d a0, fa4, fa0 -; RV32IFD-NEXT: flt.d a1, fa5, fa0 -; RV32IFD-NEXT: neg s0, a1 -; RV32IFD-NEXT: neg s1, a0 +; RV32IFD-NEXT: flt.d a0, fa5, fa0 +; RV32IFD-NEXT: neg s0, a0 +; RV32IFD-NEXT: neg s1, a1 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 ; RV32IFD-NEXT: and a1, s1, a1 @@ -292,9 +292,9 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32IFD-NEXT: call ceil +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI5_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI5_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -347,9 +347,9 @@ define i64 @test_ceil_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI5_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a2) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI5_1)(a3) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI5_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 @@ -429,12 +429,12 @@ define i64 @test_ceil_ui64(double %x) nounwind { ; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: call ceil ; RV32IFD-NEXT: lui a0, %hi(.LCPI7_0) +; RV32IFD-NEXT: fcvt.d.w fa5, zero +; RV32IFD-NEXT: fle.d a1, fa5, fa0 ; RV32IFD-NEXT: fld fa5, %lo(.LCPI7_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fle.d a0, fa4, fa0 -; RV32IFD-NEXT: flt.d a1, fa5, fa0 -; RV32IFD-NEXT: neg s0, a1 -; RV32IFD-NEXT: neg s1, a0 +; RV32IFD-NEXT: flt.d a0, fa5, fa0 +; RV32IFD-NEXT: neg s0, a0 +; RV32IFD-NEXT: neg s1, a1 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 ; RV32IFD-NEXT: and a1, s1, a1 @@ -536,9 +536,9 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32IFD-NEXT: call trunc +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI9_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI9_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -591,9 +591,9 @@ define i64 @test_trunc_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI9_1) ; RV32IZFINXZDINX-NEXT: lw a4, 
%lo(.LCPI9_0)(a2) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI9_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI9_1)(a3) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI9_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 @@ -673,12 +673,12 @@ define i64 @test_trunc_ui64(double %x) nounwind { ; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: call trunc ; RV32IFD-NEXT: lui a0, %hi(.LCPI11_0) +; RV32IFD-NEXT: fcvt.d.w fa5, zero +; RV32IFD-NEXT: fle.d a1, fa5, fa0 ; RV32IFD-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fle.d a0, fa4, fa0 -; RV32IFD-NEXT: flt.d a1, fa5, fa0 -; RV32IFD-NEXT: neg s0, a1 -; RV32IFD-NEXT: neg s1, a0 +; RV32IFD-NEXT: flt.d a0, fa5, fa0 +; RV32IFD-NEXT: neg s0, a0 +; RV32IFD-NEXT: neg s1, a1 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 ; RV32IFD-NEXT: and a1, s1, a1 @@ -780,9 +780,9 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32IFD-NEXT: call round +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI13_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -835,9 +835,9 @@ define i64 @test_round_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI13_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI13_0)(a2) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI13_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI13_1)(a3) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI13_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 @@ -917,12 +917,12 @@ define i64 @test_round_ui64(double %x) nounwind { ; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: call round ; RV32IFD-NEXT: lui a0, %hi(.LCPI15_0) +; RV32IFD-NEXT: fcvt.d.w fa5, zero +; RV32IFD-NEXT: fle.d a1, fa5, fa0 ; RV32IFD-NEXT: fld fa5, %lo(.LCPI15_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fle.d a0, fa4, fa0 -; RV32IFD-NEXT: flt.d a1, fa5, fa0 -; RV32IFD-NEXT: neg s0, a1 -; RV32IFD-NEXT: neg s1, a0 +; RV32IFD-NEXT: flt.d a0, fa5, fa0 +; RV32IFD-NEXT: neg s0, a0 +; RV32IFD-NEXT: neg s1, a1 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 ; RV32IFD-NEXT: and a1, s1, a1 @@ -1024,9 +1024,9 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32IFD-NEXT: call roundeven +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI17_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -1079,9 +1079,9 @@ define i64 @test_roundeven_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI17_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI17_0)(a2) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI17_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI17_1)(a3) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI17_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: flt.d 
a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 @@ -1161,12 +1161,12 @@ define i64 @test_roundeven_ui64(double %x) nounwind { ; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: call roundeven ; RV32IFD-NEXT: lui a0, %hi(.LCPI19_0) +; RV32IFD-NEXT: fcvt.d.w fa5, zero +; RV32IFD-NEXT: fle.d a1, fa5, fa0 ; RV32IFD-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fle.d a0, fa4, fa0 -; RV32IFD-NEXT: flt.d a1, fa5, fa0 -; RV32IFD-NEXT: neg s0, a1 -; RV32IFD-NEXT: neg s1, a0 +; RV32IFD-NEXT: flt.d a0, fa5, fa0 +; RV32IFD-NEXT: neg s0, a0 +; RV32IFD-NEXT: neg s1, a1 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 ; RV32IFD-NEXT: and a1, s1, a1 @@ -1268,9 +1268,9 @@ define i64 @test_rint_si64(double %x) nounwind { ; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32IFD-NEXT: call rint +; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: lui a0, %hi(.LCPI21_0) ; RV32IFD-NEXT: fld fa5, %lo(.LCPI21_0)(a0) -; RV32IFD-NEXT: fmv.d fs0, fa0 ; RV32IFD-NEXT: fle.d s0, fa5, fa0 ; RV32IFD-NEXT: call __fixdfdi ; RV32IFD-NEXT: lui a3, 524288 @@ -1323,9 +1323,9 @@ define i64 @test_rint_si64(double %x) nounwind { ; RV32IZFINXZDINX-NEXT: lui a3, %hi(.LCPI21_1) ; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI21_0)(a2) ; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI21_0+4)(a2) +; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI21_1)(a3) ; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI21_1+4)(a3) -; RV32IZFINXZDINX-NEXT: fle.d a6, a4, s0 ; RV32IZFINXZDINX-NEXT: flt.d a3, a2, s0 ; RV32IZFINXZDINX-NEXT: feq.d a2, s0, s0 ; RV32IZFINXZDINX-NEXT: lui a4, 524288 @@ -1405,12 +1405,12 @@ define i64 @test_rint_ui64(double %x) nounwind { ; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: call rint ; RV32IFD-NEXT: lui a0, %hi(.LCPI23_0) +; RV32IFD-NEXT: fcvt.d.w fa5, zero +; RV32IFD-NEXT: fle.d a1, fa5, fa0 ; RV32IFD-NEXT: fld fa5, %lo(.LCPI23_0)(a0) -; RV32IFD-NEXT: fcvt.d.w fa4, zero -; RV32IFD-NEXT: fle.d a0, fa4, fa0 -; RV32IFD-NEXT: flt.d a1, fa5, fa0 -; RV32IFD-NEXT: neg s0, a1 -; RV32IFD-NEXT: neg s1, a0 +; RV32IFD-NEXT: flt.d a0, fa5, fa0 +; RV32IFD-NEXT: neg s0, a0 +; RV32IFD-NEXT: neg s1, a1 ; RV32IFD-NEXT: call __fixunsdfdi ; RV32IFD-NEXT: and a0, s1, a0 ; RV32IFD-NEXT: and a1, s1, a1 diff --git a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll index e7ff991413013..10c417174e7fd 100644 --- a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll @@ -545,22 +545,22 @@ define i32 @i32_select_fcmp_oeq(double %a, double %b, i32 %c, i32 %d) nounwind { ; ; CHECKRV32ZDINX-LABEL: i32_select_fcmp_oeq: ; CHECKRV32ZDINX: # %bb.0: -; CHECKRV32ZDINX-NEXT: feq.d a1, a0, a2 -; CHECKRV32ZDINX-NEXT: mv a0, a4 -; CHECKRV32ZDINX-NEXT: bnez a1, .LBB16_2 +; CHECKRV32ZDINX-NEXT: feq.d a0, a0, a2 +; CHECKRV32ZDINX-NEXT: bnez a0, .LBB16_2 ; CHECKRV32ZDINX-NEXT: # %bb.1: -; CHECKRV32ZDINX-NEXT: mv a0, a5 +; CHECKRV32ZDINX-NEXT: mv a4, a5 ; CHECKRV32ZDINX-NEXT: .LBB16_2: +; CHECKRV32ZDINX-NEXT: mv a0, a4 ; CHECKRV32ZDINX-NEXT: ret ; ; CHECKRV64ZDINX-LABEL: i32_select_fcmp_oeq: ; CHECKRV64ZDINX: # %bb.0: -; CHECKRV64ZDINX-NEXT: feq.d a1, a0, a1 -; CHECKRV64ZDINX-NEXT: mv a0, a2 -; CHECKRV64ZDINX-NEXT: bnez a1, .LBB16_2 +; CHECKRV64ZDINX-NEXT: feq.d a0, a0, a1 +; CHECKRV64ZDINX-NEXT: bnez a0, .LBB16_2 ; CHECKRV64ZDINX-NEXT: # %bb.1: -; CHECKRV64ZDINX-NEXT: mv a0, a3 +; 
CHECKRV64ZDINX-NEXT: mv a2, a3 ; CHECKRV64ZDINX-NEXT: .LBB16_2: +; CHECKRV64ZDINX-NEXT: mv a0, a2 ; CHECKRV64ZDINX-NEXT: ret %1 = fcmp oeq double %a, %b %2 = select i1 %1, i32 %c, i32 %d @@ -577,9 +577,9 @@ define i32 @select_fcmp_oeq_1_2(double %a, double %b) { ; ; CHECKRV32ZDINX-LABEL: select_fcmp_oeq_1_2: ; CHECKRV32ZDINX: # %bb.0: -; CHECKRV32ZDINX-NEXT: li a4, 2 ; CHECKRV32ZDINX-NEXT: feq.d a0, a0, a2 -; CHECKRV32ZDINX-NEXT: sub a0, a4, a0 +; CHECKRV32ZDINX-NEXT: li a1, 2 +; CHECKRV32ZDINX-NEXT: sub a0, a1, a0 ; CHECKRV32ZDINX-NEXT: ret ; ; CHECKRV64ZDINX-LABEL: select_fcmp_oeq_1_2: diff --git a/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll b/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll index 4ae912a34d337..4478e7b8c1724 100644 --- a/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll +++ b/llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll @@ -39,9 +39,9 @@ define double @func(double %d, i32 %n) nounwind { ; ; RV64IFD-LABEL: func: ; RV64IFD: # %bb.0: # %entry -; RV64IFD-NEXT: sext.w a2, a1 ; RV64IFD-NEXT: fmv.d.x fa5, a0 -; RV64IFD-NEXT: beqz a2, .LBB0_2 +; RV64IFD-NEXT: sext.w a0, a1 +; RV64IFD-NEXT: beqz a0, .LBB0_2 ; RV64IFD-NEXT: # %bb.1: # %if.else ; RV64IFD-NEXT: addi sp, sp, -16 ; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill diff --git a/llvm/test/CodeGen/RISCV/fastcc-bf16.ll b/llvm/test/CodeGen/RISCV/fastcc-bf16.ll index 17356116081ff..91577b96de6ba 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-bf16.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-bf16.ll @@ -23,6 +23,9 @@ define bfloat @caller(<32 x bfloat> %A) nounwind { ; CHECK-NEXT: fmv.h.x fa2, a2 ; CHECK-NEXT: fmv.h.x fa3, a3 ; CHECK-NEXT: fmv.h.x fa4, a4 +; CHECK-NEXT: fmv.h.x fa5, a5 +; CHECK-NEXT: fmv.h.x fa6, a6 +; CHECK-NEXT: fmv.h.x fa7, a7 ; CHECK-NEXT: flh ft0, 32(sp) ; CHECK-NEXT: flh ft1, 36(sp) ; CHECK-NEXT: flh ft2, 40(sp) @@ -47,9 +50,6 @@ define bfloat @caller(<32 x bfloat> %A) nounwind { ; CHECK-NEXT: flh fs9, 116(sp) ; CHECK-NEXT: flh fs10, 120(sp) ; CHECK-NEXT: flh fs11, 124(sp) -; CHECK-NEXT: fmv.h.x fa5, a5 -; CHECK-NEXT: fmv.h.x fa6, a6 -; CHECK-NEXT: fmv.h.x fa7, a7 ; CHECK-NEXT: fsh fs8, 16(sp) ; CHECK-NEXT: fsh fs9, 18(sp) ; CHECK-NEXT: fsh fs10, 20(sp) diff --git a/llvm/test/CodeGen/RISCV/fastcc-float.ll b/llvm/test/CodeGen/RISCV/fastcc-float.ll index 237a72d983de4..c1c5fc440d403 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-float.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-float.ll @@ -23,6 +23,9 @@ define float @caller(<32 x float> %A) nounwind { ; CHECK-NEXT: fmv.w.x fa2, a2 ; CHECK-NEXT: fmv.w.x fa3, a3 ; CHECK-NEXT: fmv.w.x fa4, a4 +; CHECK-NEXT: fmv.w.x fa5, a5 +; CHECK-NEXT: fmv.w.x fa6, a6 +; CHECK-NEXT: fmv.w.x fa7, a7 ; CHECK-NEXT: flw ft0, 64(sp) ; CHECK-NEXT: flw ft1, 68(sp) ; CHECK-NEXT: flw ft2, 72(sp) @@ -47,9 +50,6 @@ define float @caller(<32 x float> %A) nounwind { ; CHECK-NEXT: flw fs9, 148(sp) ; CHECK-NEXT: flw fs10, 152(sp) ; CHECK-NEXT: flw fs11, 156(sp) -; CHECK-NEXT: fmv.w.x fa5, a5 -; CHECK-NEXT: fmv.w.x fa6, a6 -; CHECK-NEXT: fmv.w.x fa7, a7 ; CHECK-NEXT: fsw fs8, 32(sp) ; CHECK-NEXT: fsw fs9, 36(sp) ; CHECK-NEXT: fsw fs10, 40(sp) diff --git a/llvm/test/CodeGen/RISCV/fastcc-half.ll b/llvm/test/CodeGen/RISCV/fastcc-half.ll index bf8d4e8dcb98c..b5c3f7ef8d523 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-half.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-half.ll @@ -23,6 +23,9 @@ define half @caller(<32 x half> %A) nounwind { ; CHECK-NEXT: fmv.h.x fa2, a2 ; CHECK-NEXT: fmv.h.x fa3, a3 ; CHECK-NEXT: fmv.h.x fa4, a4 +; CHECK-NEXT: fmv.h.x fa5, a5 +; CHECK-NEXT: fmv.h.x fa6, 
a6 +; CHECK-NEXT: fmv.h.x fa7, a7 ; CHECK-NEXT: flh ft0, 32(sp) ; CHECK-NEXT: flh ft1, 36(sp) ; CHECK-NEXT: flh ft2, 40(sp) @@ -47,9 +50,6 @@ define half @caller(<32 x half> %A) nounwind { ; CHECK-NEXT: flh fs9, 116(sp) ; CHECK-NEXT: flh fs10, 120(sp) ; CHECK-NEXT: flh fs11, 124(sp) -; CHECK-NEXT: fmv.h.x fa5, a5 -; CHECK-NEXT: fmv.h.x fa6, a6 -; CHECK-NEXT: fmv.h.x fa7, a7 ; CHECK-NEXT: fsh fs8, 16(sp) ; CHECK-NEXT: fsh fs9, 18(sp) ; CHECK-NEXT: fsh fs10, 20(sp) diff --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll index 8a91c46bcdaff..beb0df5f292be 100644 --- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll +++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll @@ -287,6 +287,7 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: mv a7, a3 ; ZHINX32-NEXT: mv a6, a2 ; ZHINX32-NEXT: mv a5, a1 +; ZHINX32-NEXT: mv a4, a0 ; ZHINX32-NEXT: lh t3, 112(sp) ; ZHINX32-NEXT: lh t4, 116(sp) ; ZHINX32-NEXT: lh t5, 120(sp) @@ -307,14 +308,14 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: lh s10, 180(sp) ; ZHINX32-NEXT: lh s11, 184(sp) ; ZHINX32-NEXT: lh ra, 188(sp) -; ZHINX32-NEXT: lh a1, 192(sp) -; ZHINX32-NEXT: lh a2, 196(sp) -; ZHINX32-NEXT: lh a3, 200(sp) -; ZHINX32-NEXT: lh a4, 204(sp) -; ZHINX32-NEXT: sh a1, 32(sp) -; ZHINX32-NEXT: sh a2, 34(sp) -; ZHINX32-NEXT: sh a3, 36(sp) -; ZHINX32-NEXT: sh a4, 38(sp) +; ZHINX32-NEXT: lh a0, 192(sp) +; ZHINX32-NEXT: lh a1, 196(sp) +; ZHINX32-NEXT: lh a2, 200(sp) +; ZHINX32-NEXT: lh a3, 204(sp) +; ZHINX32-NEXT: sh a0, 32(sp) +; ZHINX32-NEXT: sh a1, 34(sp) +; ZHINX32-NEXT: sh a2, 36(sp) +; ZHINX32-NEXT: sh a3, 38(sp) ; ZHINX32-NEXT: sh s9, 24(sp) ; ZHINX32-NEXT: sh s10, 26(sp) ; ZHINX32-NEXT: sh s11, 28(sp) @@ -331,6 +332,7 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX32-NEXT: sh t1, 2(sp) ; ZHINX32-NEXT: sh t2, 4(sp) ; ZHINX32-NEXT: sh s0, 6(sp) +; ZHINX32-NEXT: mv a0, a4 ; ZHINX32-NEXT: mv a1, a5 ; ZHINX32-NEXT: mv a2, a6 ; ZHINX32-NEXT: mv a3, a7 @@ -378,6 +380,7 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: mv a7, a3 ; ZHINX64-NEXT: mv a6, a2 ; ZHINX64-NEXT: mv a5, a1 +; ZHINX64-NEXT: mv a4, a0 ; ZHINX64-NEXT: lh t3, 160(sp) ; ZHINX64-NEXT: lh t4, 168(sp) ; ZHINX64-NEXT: lh t5, 176(sp) @@ -398,14 +401,14 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: lh s10, 296(sp) ; ZHINX64-NEXT: lh s11, 304(sp) ; ZHINX64-NEXT: lh ra, 312(sp) -; ZHINX64-NEXT: lh a1, 320(sp) -; ZHINX64-NEXT: lh a2, 328(sp) -; ZHINX64-NEXT: lh a3, 336(sp) -; ZHINX64-NEXT: lh a4, 344(sp) -; ZHINX64-NEXT: sh a1, 32(sp) -; ZHINX64-NEXT: sh a2, 34(sp) -; ZHINX64-NEXT: sh a3, 36(sp) -; ZHINX64-NEXT: sh a4, 38(sp) +; ZHINX64-NEXT: lh a0, 320(sp) +; ZHINX64-NEXT: lh a1, 328(sp) +; ZHINX64-NEXT: lh a2, 336(sp) +; ZHINX64-NEXT: lh a3, 344(sp) +; ZHINX64-NEXT: sh a0, 32(sp) +; ZHINX64-NEXT: sh a1, 34(sp) +; ZHINX64-NEXT: sh a2, 36(sp) +; ZHINX64-NEXT: sh a3, 38(sp) ; ZHINX64-NEXT: sh s9, 24(sp) ; ZHINX64-NEXT: sh s10, 26(sp) ; ZHINX64-NEXT: sh s11, 28(sp) @@ -422,6 +425,7 @@ define half @caller_half_32(<32 x half> %A) nounwind { ; ZHINX64-NEXT: sh t1, 2(sp) ; ZHINX64-NEXT: sh t2, 4(sp) ; ZHINX64-NEXT: sh s0, 6(sp) +; ZHINX64-NEXT: mv a0, a4 ; ZHINX64-NEXT: mv a1, a5 ; ZHINX64-NEXT: mv a2, a6 ; ZHINX64-NEXT: mv a3, a7 @@ -893,6 +897,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: mv a7, a3 ; ZHINX32-NEXT: mv a6, a2 ; ZHINX32-NEXT: mv a5, a1 +; ZHINX32-NEXT: mv a4, a0 ; ZHINX32-NEXT: lw t3, 
160(sp) ; ZHINX32-NEXT: lw t4, 164(sp) ; ZHINX32-NEXT: lw t5, 168(sp) @@ -913,14 +918,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: lw s10, 228(sp) ; ZHINX32-NEXT: lw s11, 232(sp) ; ZHINX32-NEXT: lw ra, 236(sp) -; ZHINX32-NEXT: lw a1, 240(sp) -; ZHINX32-NEXT: lw a2, 244(sp) -; ZHINX32-NEXT: lw a3, 248(sp) -; ZHINX32-NEXT: lw a4, 252(sp) -; ZHINX32-NEXT: sw a1, 64(sp) -; ZHINX32-NEXT: sw a2, 68(sp) -; ZHINX32-NEXT: sw a3, 72(sp) -; ZHINX32-NEXT: sw a4, 76(sp) +; ZHINX32-NEXT: lw a0, 240(sp) +; ZHINX32-NEXT: lw a1, 244(sp) +; ZHINX32-NEXT: lw a2, 248(sp) +; ZHINX32-NEXT: lw a3, 252(sp) +; ZHINX32-NEXT: sw a0, 64(sp) +; ZHINX32-NEXT: sw a1, 68(sp) +; ZHINX32-NEXT: sw a2, 72(sp) +; ZHINX32-NEXT: sw a3, 76(sp) ; ZHINX32-NEXT: sw s9, 48(sp) ; ZHINX32-NEXT: sw s10, 52(sp) ; ZHINX32-NEXT: sw s11, 56(sp) @@ -937,6 +942,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX32-NEXT: sw t1, 4(sp) ; ZHINX32-NEXT: sw t2, 8(sp) ; ZHINX32-NEXT: sw s0, 12(sp) +; ZHINX32-NEXT: mv a0, a4 ; ZHINX32-NEXT: mv a1, a5 ; ZHINX32-NEXT: mv a2, a6 ; ZHINX32-NEXT: mv a3, a7 @@ -984,6 +990,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: mv a7, a3 ; ZHINX64-NEXT: mv a6, a2 ; ZHINX64-NEXT: mv a5, a1 +; ZHINX64-NEXT: mv a4, a0 ; ZHINX64-NEXT: lw t3, 208(sp) ; ZHINX64-NEXT: lw t4, 216(sp) ; ZHINX64-NEXT: lw t5, 224(sp) @@ -1004,14 +1011,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: lw s10, 344(sp) ; ZHINX64-NEXT: lw s11, 352(sp) ; ZHINX64-NEXT: lw ra, 360(sp) -; ZHINX64-NEXT: lw a1, 368(sp) -; ZHINX64-NEXT: lw a2, 376(sp) -; ZHINX64-NEXT: lw a3, 384(sp) -; ZHINX64-NEXT: lw a4, 392(sp) -; ZHINX64-NEXT: sw a1, 64(sp) -; ZHINX64-NEXT: sw a2, 68(sp) -; ZHINX64-NEXT: sw a3, 72(sp) -; ZHINX64-NEXT: sw a4, 76(sp) +; ZHINX64-NEXT: lw a0, 368(sp) +; ZHINX64-NEXT: lw a1, 376(sp) +; ZHINX64-NEXT: lw a2, 384(sp) +; ZHINX64-NEXT: lw a3, 392(sp) +; ZHINX64-NEXT: sw a0, 64(sp) +; ZHINX64-NEXT: sw a1, 68(sp) +; ZHINX64-NEXT: sw a2, 72(sp) +; ZHINX64-NEXT: sw a3, 76(sp) ; ZHINX64-NEXT: sw s9, 48(sp) ; ZHINX64-NEXT: sw s10, 52(sp) ; ZHINX64-NEXT: sw s11, 56(sp) @@ -1028,6 +1035,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZHINX64-NEXT: sw t1, 4(sp) ; ZHINX64-NEXT: sw t2, 8(sp) ; ZHINX64-NEXT: sw s0, 12(sp) +; ZHINX64-NEXT: mv a0, a4 ; ZHINX64-NEXT: mv a1, a5 ; ZHINX64-NEXT: mv a2, a6 ; ZHINX64-NEXT: mv a3, a7 @@ -1075,6 +1083,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX32-NEXT: mv a7, a3 ; ZFINX32-NEXT: mv a6, a2 ; ZFINX32-NEXT: mv a5, a1 +; ZFINX32-NEXT: mv a4, a0 ; ZFINX32-NEXT: lw t3, 160(sp) ; ZFINX32-NEXT: lw t4, 164(sp) ; ZFINX32-NEXT: lw t5, 168(sp) @@ -1095,14 +1104,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX32-NEXT: lw s10, 228(sp) ; ZFINX32-NEXT: lw s11, 232(sp) ; ZFINX32-NEXT: lw ra, 236(sp) -; ZFINX32-NEXT: lw a1, 240(sp) -; ZFINX32-NEXT: lw a2, 244(sp) -; ZFINX32-NEXT: lw a3, 248(sp) -; ZFINX32-NEXT: lw a4, 252(sp) -; ZFINX32-NEXT: sw a1, 64(sp) -; ZFINX32-NEXT: sw a2, 68(sp) -; ZFINX32-NEXT: sw a3, 72(sp) -; ZFINX32-NEXT: sw a4, 76(sp) +; ZFINX32-NEXT: lw a0, 240(sp) +; ZFINX32-NEXT: lw a1, 244(sp) +; ZFINX32-NEXT: lw a2, 248(sp) +; ZFINX32-NEXT: lw a3, 252(sp) +; ZFINX32-NEXT: sw a0, 64(sp) +; ZFINX32-NEXT: sw a1, 68(sp) +; ZFINX32-NEXT: sw a2, 72(sp) +; ZFINX32-NEXT: sw a3, 76(sp) ; ZFINX32-NEXT: sw s9, 48(sp) ; ZFINX32-NEXT: sw s10, 52(sp) ; ZFINX32-NEXT: sw s11, 56(sp) @@ -1119,6 +1128,7 @@ define float @caller_float_32(<32 x float> %A) 
nounwind { ; ZFINX32-NEXT: sw t1, 4(sp) ; ZFINX32-NEXT: sw t2, 8(sp) ; ZFINX32-NEXT: sw s0, 12(sp) +; ZFINX32-NEXT: mv a0, a4 ; ZFINX32-NEXT: mv a1, a5 ; ZFINX32-NEXT: mv a2, a6 ; ZFINX32-NEXT: mv a3, a7 @@ -1166,6 +1176,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: mv a7, a3 ; ZFINX64-NEXT: mv a6, a2 ; ZFINX64-NEXT: mv a5, a1 +; ZFINX64-NEXT: mv a4, a0 ; ZFINX64-NEXT: lw t3, 208(sp) ; ZFINX64-NEXT: lw t4, 216(sp) ; ZFINX64-NEXT: lw t5, 224(sp) @@ -1186,14 +1197,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: lw s10, 344(sp) ; ZFINX64-NEXT: lw s11, 352(sp) ; ZFINX64-NEXT: lw ra, 360(sp) -; ZFINX64-NEXT: lw a1, 368(sp) -; ZFINX64-NEXT: lw a2, 376(sp) -; ZFINX64-NEXT: lw a3, 384(sp) -; ZFINX64-NEXT: lw a4, 392(sp) -; ZFINX64-NEXT: sw a1, 64(sp) -; ZFINX64-NEXT: sw a2, 68(sp) -; ZFINX64-NEXT: sw a3, 72(sp) -; ZFINX64-NEXT: sw a4, 76(sp) +; ZFINX64-NEXT: lw a0, 368(sp) +; ZFINX64-NEXT: lw a1, 376(sp) +; ZFINX64-NEXT: lw a2, 384(sp) +; ZFINX64-NEXT: lw a3, 392(sp) +; ZFINX64-NEXT: sw a0, 64(sp) +; ZFINX64-NEXT: sw a1, 68(sp) +; ZFINX64-NEXT: sw a2, 72(sp) +; ZFINX64-NEXT: sw a3, 76(sp) ; ZFINX64-NEXT: sw s9, 48(sp) ; ZFINX64-NEXT: sw s10, 52(sp) ; ZFINX64-NEXT: sw s11, 56(sp) @@ -1210,6 +1221,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZFINX64-NEXT: sw t1, 4(sp) ; ZFINX64-NEXT: sw t2, 8(sp) ; ZFINX64-NEXT: sw s0, 12(sp) +; ZFINX64-NEXT: mv a0, a4 ; ZFINX64-NEXT: mv a1, a5 ; ZFINX64-NEXT: mv a2, a6 ; ZFINX64-NEXT: mv a3, a7 @@ -1257,6 +1269,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: mv a7, a3 ; ZDINX32-NEXT: mv a6, a2 ; ZDINX32-NEXT: mv a5, a1 +; ZDINX32-NEXT: mv a4, a0 ; ZDINX32-NEXT: lw t3, 160(sp) ; ZDINX32-NEXT: lw t4, 164(sp) ; ZDINX32-NEXT: lw t5, 168(sp) @@ -1277,14 +1290,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: lw s10, 228(sp) ; ZDINX32-NEXT: lw s11, 232(sp) ; ZDINX32-NEXT: lw ra, 236(sp) -; ZDINX32-NEXT: lw a1, 240(sp) -; ZDINX32-NEXT: lw a2, 244(sp) -; ZDINX32-NEXT: lw a3, 248(sp) -; ZDINX32-NEXT: lw a4, 252(sp) -; ZDINX32-NEXT: sw a1, 64(sp) -; ZDINX32-NEXT: sw a2, 68(sp) -; ZDINX32-NEXT: sw a3, 72(sp) -; ZDINX32-NEXT: sw a4, 76(sp) +; ZDINX32-NEXT: lw a0, 240(sp) +; ZDINX32-NEXT: lw a1, 244(sp) +; ZDINX32-NEXT: lw a2, 248(sp) +; ZDINX32-NEXT: lw a3, 252(sp) +; ZDINX32-NEXT: sw a0, 64(sp) +; ZDINX32-NEXT: sw a1, 68(sp) +; ZDINX32-NEXT: sw a2, 72(sp) +; ZDINX32-NEXT: sw a3, 76(sp) ; ZDINX32-NEXT: sw s9, 48(sp) ; ZDINX32-NEXT: sw s10, 52(sp) ; ZDINX32-NEXT: sw s11, 56(sp) @@ -1301,6 +1314,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX32-NEXT: sw t1, 4(sp) ; ZDINX32-NEXT: sw t2, 8(sp) ; ZDINX32-NEXT: sw s0, 12(sp) +; ZDINX32-NEXT: mv a0, a4 ; ZDINX32-NEXT: mv a1, a5 ; ZDINX32-NEXT: mv a2, a6 ; ZDINX32-NEXT: mv a3, a7 @@ -1348,6 +1362,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: mv a7, a3 ; ZDINX64-NEXT: mv a6, a2 ; ZDINX64-NEXT: mv a5, a1 +; ZDINX64-NEXT: mv a4, a0 ; ZDINX64-NEXT: lw t3, 208(sp) ; ZDINX64-NEXT: lw t4, 216(sp) ; ZDINX64-NEXT: lw t5, 224(sp) @@ -1368,14 +1383,14 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: lw s10, 344(sp) ; ZDINX64-NEXT: lw s11, 352(sp) ; ZDINX64-NEXT: lw ra, 360(sp) -; ZDINX64-NEXT: lw a1, 368(sp) -; ZDINX64-NEXT: lw a2, 376(sp) -; ZDINX64-NEXT: lw a3, 384(sp) -; ZDINX64-NEXT: lw a4, 392(sp) -; ZDINX64-NEXT: sw a1, 64(sp) -; ZDINX64-NEXT: sw a2, 68(sp) -; ZDINX64-NEXT: sw a3, 72(sp) -; ZDINX64-NEXT: sw a4, 
76(sp) +; ZDINX64-NEXT: lw a0, 368(sp) +; ZDINX64-NEXT: lw a1, 376(sp) +; ZDINX64-NEXT: lw a2, 384(sp) +; ZDINX64-NEXT: lw a3, 392(sp) +; ZDINX64-NEXT: sw a0, 64(sp) +; ZDINX64-NEXT: sw a1, 68(sp) +; ZDINX64-NEXT: sw a2, 72(sp) +; ZDINX64-NEXT: sw a3, 76(sp) ; ZDINX64-NEXT: sw s9, 48(sp) ; ZDINX64-NEXT: sw s10, 52(sp) ; ZDINX64-NEXT: sw s11, 56(sp) @@ -1392,6 +1407,7 @@ define float @caller_float_32(<32 x float> %A) nounwind { ; ZDINX64-NEXT: sw t1, 4(sp) ; ZDINX64-NEXT: sw t2, 8(sp) ; ZDINX64-NEXT: sw s0, 12(sp) +; ZDINX64-NEXT: mv a0, a4 ; ZDINX64-NEXT: mv a1, a5 ; ZDINX64-NEXT: mv a2, a6 ; ZDINX64-NEXT: mv a3, a7 diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index fc866d71a3a70..89858af3282d6 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -1417,13 +1417,13 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV32IF-LABEL: fcvt_w_s_sat_i16: ; RV32IF: # %bb.0: # %start ; RV32IF-NEXT: feq.s a0, fa0, fa0 -; RV32IF-NEXT: lui a1, %hi(.LCPI24_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI24_0)(a1) ; RV32IF-NEXT: lui a1, 815104 -; RV32IF-NEXT: fmv.w.x fa4, a1 -; RV32IF-NEXT: fmax.s fa4, fa0, fa4 +; RV32IF-NEXT: fmv.w.x fa5, a1 +; RV32IF-NEXT: lui a1, %hi(.LCPI24_0) ; RV32IF-NEXT: neg a0, a0 -; RV32IF-NEXT: fmin.s fa5, fa4, fa5 +; RV32IF-NEXT: fmax.s fa5, fa0, fa5 +; RV32IF-NEXT: flw fa4, %lo(.LCPI24_0)(a1) +; RV32IF-NEXT: fmin.s fa5, fa5, fa4 ; RV32IF-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IF-NEXT: and a0, a0, a1 ; RV32IF-NEXT: ret @@ -1431,13 +1431,13 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV64IF-LABEL: fcvt_w_s_sat_i16: ; RV64IF: # %bb.0: # %start ; RV64IF-NEXT: feq.s a0, fa0, fa0 -; RV64IF-NEXT: lui a1, %hi(.LCPI24_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI24_0)(a1) ; RV64IF-NEXT: lui a1, 815104 -; RV64IF-NEXT: fmv.w.x fa4, a1 -; RV64IF-NEXT: fmax.s fa4, fa0, fa4 +; RV64IF-NEXT: fmv.w.x fa5, a1 +; RV64IF-NEXT: lui a1, %hi(.LCPI24_0) ; RV64IF-NEXT: neg a0, a0 -; RV64IF-NEXT: fmin.s fa5, fa4, fa5 +; RV64IF-NEXT: fmax.s fa5, fa0, fa5 +; RV64IF-NEXT: flw fa4, %lo(.LCPI24_0)(a1) +; RV64IF-NEXT: fmin.s fa5, fa5, fa4 ; RV64IF-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret @@ -1602,21 +1602,21 @@ define zeroext i16 @fcvt_wu_s_i16(float %a) nounwind { define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind { ; RV32IF-LABEL: fcvt_wu_s_sat_i16: ; RV32IF: # %bb.0: # %start +; RV32IF-NEXT: fmv.w.x fa5, zero ; RV32IF-NEXT: lui a0, %hi(.LCPI26_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI26_0)(a0) -; RV32IF-NEXT: fmv.w.x fa4, zero -; RV32IF-NEXT: fmax.s fa4, fa0, fa4 -; RV32IF-NEXT: fmin.s fa5, fa4, fa5 +; RV32IF-NEXT: fmax.s fa5, fa0, fa5 +; RV32IF-NEXT: flw fa4, %lo(.LCPI26_0)(a0) +; RV32IF-NEXT: fmin.s fa5, fa5, fa4 ; RV32IF-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fcvt_wu_s_sat_i16: ; RV64IF: # %bb.0: # %start +; RV64IF-NEXT: fmv.w.x fa5, zero ; RV64IF-NEXT: lui a0, %hi(.LCPI26_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI26_0)(a0) -; RV64IF-NEXT: fmv.w.x fa4, zero -; RV64IF-NEXT: fmax.s fa4, fa0, fa4 -; RV64IF-NEXT: fmin.s fa5, fa4, fa5 +; RV64IF-NEXT: fmax.s fa5, fa0, fa5 +; RV64IF-NEXT: flw fa4, %lo(.LCPI26_0)(a0) +; RV64IF-NEXT: fmin.s fa5, fa5, fa4 ; RV64IF-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IF-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll index 0cbfc96bf485e..9b3a643e59e68 100644 --- a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll +++ 
b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll @@ -234,8 +234,8 @@ define i32 @fcmp_one(float %a, float %b) nounwind strictfp { ; CHECKIF-NEXT: frflags a0 ; CHECKIF-NEXT: flt.s a2, fa1, fa0 ; CHECKIF-NEXT: fsflags a0 -; CHECKIF-NEXT: or a0, a2, a1 ; CHECKIF-NEXT: feq.s zero, fa1, fa0 +; CHECKIF-NEXT: or a0, a2, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcmp_one: @@ -247,9 +247,8 @@ define i32 @fcmp_one(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: frflags a2 ; CHECKIZFINX-NEXT: flt.s a4, a1, a0 ; CHECKIZFINX-NEXT: fsflags a2 -; CHECKIZFINX-NEXT: or a2, a4, a3 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: or a0, a4, a3 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_one: @@ -353,9 +352,9 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp { ; CHECKIF-NEXT: frflags a0 ; CHECKIF-NEXT: flt.s a2, fa1, fa0 ; CHECKIF-NEXT: fsflags a0 +; CHECKIF-NEXT: feq.s zero, fa1, fa0 ; CHECKIF-NEXT: or a1, a2, a1 ; CHECKIF-NEXT: xori a0, a1, 1 -; CHECKIF-NEXT: feq.s zero, fa1, fa0 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcmp_ueq: @@ -367,10 +366,9 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: frflags a2 ; CHECKIZFINX-NEXT: flt.s a4, a1, a0 ; CHECKIZFINX-NEXT: fsflags a2 -; CHECKIZFINX-NEXT: or a3, a4, a3 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: or a3, a4, a3 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ueq: @@ -429,8 +427,8 @@ define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp { ; CHECKIF-NEXT: frflags a0 ; CHECKIF-NEXT: fle.s a1, fa0, fa1 ; CHECKIF-NEXT: fsflags a0 -; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: feq.s zero, fa0, fa1 +; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcmp_ugt: @@ -438,9 +436,8 @@ define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: frflags a2 ; CHECKIZFINX-NEXT: fle.s a3, a0, a1 ; CHECKIZFINX-NEXT: fsflags a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a0, a1 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ugt: @@ -473,8 +470,8 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp { ; CHECKIF-NEXT: frflags a0 ; CHECKIF-NEXT: flt.s a1, fa0, fa1 ; CHECKIF-NEXT: fsflags a0 -; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: feq.s zero, fa0, fa1 +; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcmp_uge: @@ -482,9 +479,8 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: frflags a2 ; CHECKIZFINX-NEXT: flt.s a3, a0, a1 ; CHECKIZFINX-NEXT: fsflags a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a0, a1 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_uge: @@ -519,8 +515,8 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp { ; CHECKIF-NEXT: frflags a0 ; CHECKIF-NEXT: fle.s a1, fa1, fa0 ; CHECKIF-NEXT: fsflags a0 -; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: feq.s zero, fa1, fa0 +; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcmp_ult: @@ -528,9 +524,8 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: frflags a2 ; CHECKIZFINX-NEXT: fle.s a3, a1, a0 ; CHECKIZFINX-NEXT: fsflags a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; 
CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ult: @@ -563,8 +558,8 @@ define i32 @fcmp_ule(float %a, float %b) nounwind strictfp { ; CHECKIF-NEXT: frflags a0 ; CHECKIF-NEXT: flt.s a1, fa1, fa0 ; CHECKIF-NEXT: fsflags a0 -; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: feq.s zero, fa1, fa0 +; CHECKIF-NEXT: xori a0, a1, 1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcmp_ule: @@ -572,9 +567,8 @@ define i32 @fcmp_ule(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: frflags a2 ; CHECKIZFINX-NEXT: flt.s a3, a1, a0 ; CHECKIZFINX-NEXT: fsflags a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ule: diff --git a/llvm/test/CodeGen/RISCV/float-select-fcmp.ll b/llvm/test/CodeGen/RISCV/float-select-fcmp.ll index a2ff0d33e2d31..5ec0335972394 100644 --- a/llvm/test/CodeGen/RISCV/float-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/float-select-fcmp.ll @@ -387,12 +387,12 @@ define i32 @i32_select_fcmp_oeq(float %a, float %b, i32 %c, i32 %d) nounwind { ; ; CHECKZFINX-LABEL: i32_select_fcmp_oeq: ; CHECKZFINX: # %bb.0: -; CHECKZFINX-NEXT: feq.s a1, a0, a1 -; CHECKZFINX-NEXT: mv a0, a2 -; CHECKZFINX-NEXT: bnez a1, .LBB16_2 +; CHECKZFINX-NEXT: feq.s a0, a0, a1 +; CHECKZFINX-NEXT: bnez a0, .LBB16_2 ; CHECKZFINX-NEXT: # %bb.1: -; CHECKZFINX-NEXT: mv a0, a3 +; CHECKZFINX-NEXT: mv a2, a3 ; CHECKZFINX-NEXT: .LBB16_2: +; CHECKZFINX-NEXT: mv a0, a2 ; CHECKZFINX-NEXT: ret %1 = fcmp oeq float %a, %b %2 = select i1 %1, i32 %c, i32 %d diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index b8dc7804c4908..59ba3652c89e9 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -929,19 +929,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call f +; RV32I-NEXT: addi s5, s5, 1 +; RV32I-NEXT: seqz a0, s5 +; RV32I-NEXT: add s6, s6, a0 ; RV32I-NEXT: lw a0, 8(s7) ; RV32I-NEXT: lw a1, 12(s7) -; RV32I-NEXT: addi s5, s5, 1 -; RV32I-NEXT: seqz a2, s5 -; RV32I-NEXT: add s6, s6, a2 -; RV32I-NEXT: xor a2, s5, s2 ; RV32I-NEXT: add a1, a1, s4 -; RV32I-NEXT: xor a3, s6, s1 -; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: xor a2, s5, s2 ; RV32I-NEXT: add s3, a0, s3 ; RV32I-NEXT: sltu s4, s3, a0 ; RV32I-NEXT: add s4, a1, s4 -; RV32I-NEXT: bnez a2, .LBB20_5 +; RV32I-NEXT: xor a0, s6, s1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: bnez a0, .LBB20_5 ; RV32I-NEXT: .LBB20_6: # %for.cond.cleanup ; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: mv a1, s4 @@ -994,19 +994,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV32I-MEDIUM-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-MEDIUM-NEXT: mv a0, s0 ; RV32I-MEDIUM-NEXT: call f +; RV32I-MEDIUM-NEXT: addi s5, s5, 1 +; RV32I-MEDIUM-NEXT: seqz a0, s5 +; RV32I-MEDIUM-NEXT: add s6, s6, a0 ; RV32I-MEDIUM-NEXT: lw a0, 8(s7) ; RV32I-MEDIUM-NEXT: lw a1, 12(s7) -; RV32I-MEDIUM-NEXT: addi s5, s5, 1 -; RV32I-MEDIUM-NEXT: seqz a2, s5 -; RV32I-MEDIUM-NEXT: add s6, s6, a2 -; RV32I-MEDIUM-NEXT: xor a2, s5, s2 ; RV32I-MEDIUM-NEXT: add a1, a1, s4 -; RV32I-MEDIUM-NEXT: xor a3, s6, s1 -; RV32I-MEDIUM-NEXT: or a2, a2, a3 +; RV32I-MEDIUM-NEXT: xor a2, s5, s2 ; RV32I-MEDIUM-NEXT: add s3, a0, s3 ; RV32I-MEDIUM-NEXT: sltu s4, s3, a0 ; RV32I-MEDIUM-NEXT: add s4, a1, s4 -; RV32I-MEDIUM-NEXT: 
bnez a2, .LBB20_5 +; RV32I-MEDIUM-NEXT: xor a0, s6, s1 +; RV32I-MEDIUM-NEXT: or a0, a2, a0 +; RV32I-MEDIUM-NEXT: bnez a0, .LBB20_5 ; RV32I-MEDIUM-NEXT: .LBB20_6: # %for.cond.cleanup ; RV32I-MEDIUM-NEXT: mv a0, s3 ; RV32I-MEDIUM-NEXT: mv a1, s4 @@ -1042,8 +1042,8 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call f ; RV64I-NEXT: ld a0, 8(s3) -; RV64I-NEXT: addi s1, s1, -1 ; RV64I-NEXT: add s2, a0, s2 +; RV64I-NEXT: addi s1, s1, -1 ; RV64I-NEXT: bnez s1, .LBB20_2 ; RV64I-NEXT: j .LBB20_4 ; RV64I-NEXT: .LBB20_3: @@ -1078,8 +1078,8 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV64I-MEDIUM-NEXT: mv a0, s0 ; RV64I-MEDIUM-NEXT: call f ; RV64I-MEDIUM-NEXT: ld a0, 8(s3) -; RV64I-MEDIUM-NEXT: addi s1, s1, -1 ; RV64I-MEDIUM-NEXT: add s2, a0, s2 +; RV64I-MEDIUM-NEXT: addi s1, s1, -1 ; RV64I-MEDIUM-NEXT: bnez s1, .LBB20_2 ; RV64I-MEDIUM-NEXT: j .LBB20_4 ; RV64I-MEDIUM-NEXT: .LBB20_3: @@ -1108,18 +1108,18 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind { ; RV64I-LARGE-NEXT: mv s0, a2 ; RV64I-LARGE-NEXT: mv s1, a1 ; RV64I-LARGE-NEXT: li s2, 0 +; RV64I-LARGE-NEXT: slli a0, a0, 4 ; RV64I-LARGE-NEXT: .Lpcrel_hi14: ; RV64I-LARGE-NEXT: auipc a1, %pcrel_hi(.LCPI20_0) -; RV64I-LARGE-NEXT: ld s3, %pcrel_lo(.Lpcrel_hi14)(a1) -; RV64I-LARGE-NEXT: slli a0, a0, 4 -; RV64I-LARGE-NEXT: add s4, a2, a0 +; RV64I-LARGE-NEXT: add s3, a2, a0 +; RV64I-LARGE-NEXT: ld s4, %pcrel_lo(.Lpcrel_hi14)(a1) ; RV64I-LARGE-NEXT: .LBB20_2: # %for.body ; RV64I-LARGE-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64I-LARGE-NEXT: mv a0, s0 -; RV64I-LARGE-NEXT: jalr s3 -; RV64I-LARGE-NEXT: ld a0, 8(s4) -; RV64I-LARGE-NEXT: addi s1, s1, -1 +; RV64I-LARGE-NEXT: jalr s4 +; RV64I-LARGE-NEXT: ld a0, 8(s3) ; RV64I-LARGE-NEXT: add s2, a0, s2 +; RV64I-LARGE-NEXT: addi s1, s1, -1 ; RV64I-LARGE-NEXT: bnez s1, .LBB20_2 ; RV64I-LARGE-NEXT: j .LBB20_4 ; RV64I-LARGE-NEXT: .LBB20_3: diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll index e7719dc70660b..3ea9c4c6ad754 100644 --- a/llvm/test/CodeGen/RISCV/forced-atomics.ll +++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll @@ -1357,28 +1357,28 @@ define i32 @rmw32_max_seq_cst(ptr %p) nounwind { ; RV32-NO-ATOMIC-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: mv s0, a0 -; RV32-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV32-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV32-NO-ATOMIC-NEXT: j .LBB23_2 ; RV32-NO-ATOMIC-NEXT: .LBB23_1: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # in Loop: Header=BB23_2 Depth=1 -; RV32-NO-ATOMIC-NEXT: sw a1, 4(sp) +; RV32-NO-ATOMIC-NEXT: sw a0, 4(sp) ; RV32-NO-ATOMIC-NEXT: addi a1, sp, 4 ; RV32-NO-ATOMIC-NEXT: li a3, 5 ; RV32-NO-ATOMIC-NEXT: li a4, 5 ; RV32-NO-ATOMIC-NEXT: mv a0, s0 ; RV32-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV32-NO-ATOMIC-NEXT: lw a1, 4(sp) -; RV32-NO-ATOMIC-NEXT: bnez a0, .LBB23_4 +; RV32-NO-ATOMIC-NEXT: mv a1, a0 +; RV32-NO-ATOMIC-NEXT: lw a0, 4(sp) +; RV32-NO-ATOMIC-NEXT: bnez a1, .LBB23_4 ; RV32-NO-ATOMIC-NEXT: .LBB23_2: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NO-ATOMIC-NEXT: mv a2, a1 -; RV32-NO-ATOMIC-NEXT: bgtz a1, .LBB23_1 +; RV32-NO-ATOMIC-NEXT: mv a2, a0 +; RV32-NO-ATOMIC-NEXT: bgtz a0, .LBB23_1 ; RV32-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # in Loop: Header=BB23_2 Depth=1 ; RV32-NO-ATOMIC-NEXT: li a2, 1 ; RV32-NO-ATOMIC-NEXT: j .LBB23_1 ; 
RV32-NO-ATOMIC-NEXT: .LBB23_4: # %atomicrmw.end -; RV32-NO-ATOMIC-NEXT: mv a0, a1 ; RV32-NO-ATOMIC-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: addi sp, sp, 16 @@ -1410,29 +1410,29 @@ define i32 @rmw32_max_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: j .LBB23_2 ; RV64-NO-ATOMIC-NEXT: .LBB23_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB23_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sw a1, 12(sp) +; RV64-NO-ATOMIC-NEXT: sw a0, 12(sp) ; RV64-NO-ATOMIC-NEXT: addi a1, sp, 12 ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV64-NO-ATOMIC-NEXT: lw a1, 12(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB23_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: lw a0, 12(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB23_4 ; RV64-NO-ATOMIC-NEXT: .LBB23_2: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: li a0, 1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: blt a0, a1, .LBB23_1 +; RV64-NO-ATOMIC-NEXT: li a1, 1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: blt a1, a0, .LBB23_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB23_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB23_1 ; RV64-NO-ATOMIC-NEXT: .LBB23_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: addi sp, sp, 32 @@ -1469,29 +1469,29 @@ define i32 @rmw32_min_seq_cst(ptr %p) nounwind { ; RV32-NO-ATOMIC-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: mv s0, a0 -; RV32-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV32-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV32-NO-ATOMIC-NEXT: li s1, 2 ; RV32-NO-ATOMIC-NEXT: j .LBB24_2 ; RV32-NO-ATOMIC-NEXT: .LBB24_1: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # in Loop: Header=BB24_2 Depth=1 -; RV32-NO-ATOMIC-NEXT: sw a1, 0(sp) +; RV32-NO-ATOMIC-NEXT: sw a0, 0(sp) ; RV32-NO-ATOMIC-NEXT: mv a1, sp ; RV32-NO-ATOMIC-NEXT: li a3, 5 ; RV32-NO-ATOMIC-NEXT: li a4, 5 ; RV32-NO-ATOMIC-NEXT: mv a0, s0 ; RV32-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV32-NO-ATOMIC-NEXT: lw a1, 0(sp) -; RV32-NO-ATOMIC-NEXT: bnez a0, .LBB24_4 +; RV32-NO-ATOMIC-NEXT: mv a1, a0 +; RV32-NO-ATOMIC-NEXT: lw a0, 0(sp) +; RV32-NO-ATOMIC-NEXT: bnez a1, .LBB24_4 ; RV32-NO-ATOMIC-NEXT: .LBB24_2: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NO-ATOMIC-NEXT: mv a2, a1 -; RV32-NO-ATOMIC-NEXT: blt a1, s1, .LBB24_1 +; RV32-NO-ATOMIC-NEXT: mv a2, a0 +; RV32-NO-ATOMIC-NEXT: blt a0, s1, .LBB24_1 ; RV32-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # in Loop: Header=BB24_2 Depth=1 ; RV32-NO-ATOMIC-NEXT: li a2, 1 ; RV32-NO-ATOMIC-NEXT: j .LBB24_1 ; RV32-NO-ATOMIC-NEXT: .LBB24_4: # %atomicrmw.end -; RV32-NO-ATOMIC-NEXT: mv a0, a1 ; RV32-NO-ATOMIC-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1525,29 +1525,29 @@ define i32 
@rmw32_min_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: li s1, 2 ; RV64-NO-ATOMIC-NEXT: j .LBB24_2 ; RV64-NO-ATOMIC-NEXT: .LBB24_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB24_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sw a1, 4(sp) +; RV64-NO-ATOMIC-NEXT: sw a0, 4(sp) ; RV64-NO-ATOMIC-NEXT: addi a1, sp, 4 ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV64-NO-ATOMIC-NEXT: lw a1, 4(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB24_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: lw a0, 4(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB24_4 ; RV64-NO-ATOMIC-NEXT: .LBB24_2: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: blt a1, s1, .LBB24_1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: blt a0, s1, .LBB24_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB24_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB24_1 ; RV64-NO-ATOMIC-NEXT: .LBB24_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -1584,21 +1584,21 @@ define i32 @rmw32_umax_seq_cst(ptr %p) nounwind { ; RV32-NO-ATOMIC-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: mv s0, a0 -; RV32-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV32-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV32-NO-ATOMIC-NEXT: .LBB25_1: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NO-ATOMIC-NEXT: seqz a2, a1 -; RV32-NO-ATOMIC-NEXT: add a2, a1, a2 -; RV32-NO-ATOMIC-NEXT: sw a1, 4(sp) +; RV32-NO-ATOMIC-NEXT: seqz a2, a0 +; RV32-NO-ATOMIC-NEXT: add a2, a0, a2 +; RV32-NO-ATOMIC-NEXT: sw a0, 4(sp) ; RV32-NO-ATOMIC-NEXT: addi a1, sp, 4 ; RV32-NO-ATOMIC-NEXT: li a3, 5 ; RV32-NO-ATOMIC-NEXT: li a4, 5 ; RV32-NO-ATOMIC-NEXT: mv a0, s0 ; RV32-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV32-NO-ATOMIC-NEXT: lw a1, 4(sp) -; RV32-NO-ATOMIC-NEXT: beqz a0, .LBB25_1 +; RV32-NO-ATOMIC-NEXT: mv a1, a0 +; RV32-NO-ATOMIC-NEXT: lw a0, 4(sp) +; RV32-NO-ATOMIC-NEXT: beqz a1, .LBB25_1 ; RV32-NO-ATOMIC-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NO-ATOMIC-NEXT: mv a0, a1 ; RV32-NO-ATOMIC-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: addi sp, sp, 16 @@ -1630,29 +1630,29 @@ define i32 @rmw32_umax_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: j .LBB25_2 ; RV64-NO-ATOMIC-NEXT: .LBB25_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB25_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sw a1, 12(sp) +; RV64-NO-ATOMIC-NEXT: sw a0, 12(sp) ; RV64-NO-ATOMIC-NEXT: addi a1, sp, 12 ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call 
__atomic_compare_exchange_4 -; RV64-NO-ATOMIC-NEXT: lw a1, 12(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB25_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: lw a0, 12(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB25_4 ; RV64-NO-ATOMIC-NEXT: .LBB25_2: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: li a0, 1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: bltu a0, a1, .LBB25_1 +; RV64-NO-ATOMIC-NEXT: li a1, 1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: bltu a1, a0, .LBB25_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB25_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB25_1 ; RV64-NO-ATOMIC-NEXT: .LBB25_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: addi sp, sp, 32 @@ -1689,29 +1689,29 @@ define i32 @rmw32_umin_seq_cst(ptr %p) nounwind { ; RV32-NO-ATOMIC-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32-NO-ATOMIC-NEXT: mv s0, a0 -; RV32-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV32-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV32-NO-ATOMIC-NEXT: li s1, 2 ; RV32-NO-ATOMIC-NEXT: j .LBB26_2 ; RV32-NO-ATOMIC-NEXT: .LBB26_1: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # in Loop: Header=BB26_2 Depth=1 -; RV32-NO-ATOMIC-NEXT: sw a1, 0(sp) +; RV32-NO-ATOMIC-NEXT: sw a0, 0(sp) ; RV32-NO-ATOMIC-NEXT: mv a1, sp ; RV32-NO-ATOMIC-NEXT: li a3, 5 ; RV32-NO-ATOMIC-NEXT: li a4, 5 ; RV32-NO-ATOMIC-NEXT: mv a0, s0 ; RV32-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV32-NO-ATOMIC-NEXT: lw a1, 0(sp) -; RV32-NO-ATOMIC-NEXT: bnez a0, .LBB26_4 +; RV32-NO-ATOMIC-NEXT: mv a1, a0 +; RV32-NO-ATOMIC-NEXT: lw a0, 0(sp) +; RV32-NO-ATOMIC-NEXT: bnez a1, .LBB26_4 ; RV32-NO-ATOMIC-NEXT: .LBB26_2: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NO-ATOMIC-NEXT: mv a2, a1 -; RV32-NO-ATOMIC-NEXT: bltu a1, s1, .LBB26_1 +; RV32-NO-ATOMIC-NEXT: mv a2, a0 +; RV32-NO-ATOMIC-NEXT: bltu a0, s1, .LBB26_1 ; RV32-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NO-ATOMIC-NEXT: # in Loop: Header=BB26_2 Depth=1 ; RV32-NO-ATOMIC-NEXT: li a2, 1 ; RV32-NO-ATOMIC-NEXT: j .LBB26_1 ; RV32-NO-ATOMIC-NEXT: .LBB26_4: # %atomicrmw.end -; RV32-NO-ATOMIC-NEXT: mv a0, a1 ; RV32-NO-ATOMIC-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NO-ATOMIC-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -1745,29 +1745,29 @@ define i32 @rmw32_umin_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: lw a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: lw a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: li s1, 2 ; RV64-NO-ATOMIC-NEXT: j .LBB26_2 ; RV64-NO-ATOMIC-NEXT: .LBB26_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB26_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sw a1, 4(sp) +; RV64-NO-ATOMIC-NEXT: sw a0, 4(sp) ; RV64-NO-ATOMIC-NEXT: addi a1, sp, 4 ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_4 -; RV64-NO-ATOMIC-NEXT: lw a1, 4(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB26_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: lw a0, 4(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB26_4 ; 
RV64-NO-ATOMIC-NEXT: .LBB26_2: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: bltu a1, s1, .LBB26_1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: bltu a0, s1, .LBB26_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB26_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB26_1 ; RV64-NO-ATOMIC-NEXT: .LBB26_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3348,43 +3348,43 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s0) ; RV32-NEXT: j .LBB49_2 ; RV32-NEXT: .LBB49_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV32-NEXT: neg a3, a0 +; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a3, a3, a1 -; RV32-NEXT: sw a4, 0(sp) +; RV32-NEXT: sw a0, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: mv a1, sp ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: lw a0, 0(sp) ; RV32-NEXT: lw a1, 4(sp) -; RV32-NEXT: bnez a0, .LBB49_6 +; RV32-NEXT: bnez a2, .LBB49_6 ; RV32-NEXT: .LBB49_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: beqz a1, .LBB49_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV32-NEXT: sgtz a0, a1 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: bnez a0, .LBB49_1 +; RV32-NEXT: sgtz a3, a1 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bnez a3, .LBB49_1 ; RV32-NEXT: j .LBB49_5 ; RV32-NEXT: .LBB49_4: # in Loop: Header=BB49_2 Depth=1 -; RV32-NEXT: sltiu a0, a4, 2 -; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: bnez a0, .LBB49_1 +; RV32-NEXT: sltiu a2, a0, 2 +; RV32-NEXT: xori a3, a2, 1 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bnez a3, .LBB49_1 ; RV32-NEXT: .LBB49_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB49_2 Depth=1 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: j .LBB49_1 ; RV32-NEXT: .LBB49_6: # %atomicrmw.end -; RV32-NEXT: mv a0, a4 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 @@ -3396,28 +3396,28 @@ define i64 @rmw64_max_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: ld a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: ld a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: j .LBB49_2 ; RV64-NO-ATOMIC-NEXT: .LBB49_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB49_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sd a1, 8(sp) +; RV64-NO-ATOMIC-NEXT: sd a0, 8(sp) ; RV64-NO-ATOMIC-NEXT: addi a1, sp, 8 ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_8 -; RV64-NO-ATOMIC-NEXT: ld a1, 8(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB49_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: ld a0, 8(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB49_4 ; RV64-NO-ATOMIC-NEXT: .LBB49_2: # %atomicrmw.start ; 
RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: bgtz a1, .LBB49_1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: bgtz a0, .LBB49_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB49_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB49_1 ; RV64-NO-ATOMIC-NEXT: .LBB49_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: addi sp, sp, 32 @@ -3453,42 +3453,42 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s0) ; RV32-NEXT: j .LBB50_2 ; RV32-NEXT: .LBB50_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV32-NEXT: neg a3, a0 +; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a3, a3, a1 -; RV32-NEXT: sw a4, 0(sp) +; RV32-NEXT: sw a0, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: mv a1, sp ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: lw a0, 0(sp) ; RV32-NEXT: lw a1, 4(sp) -; RV32-NEXT: bnez a0, .LBB50_6 +; RV32-NEXT: bnez a2, .LBB50_6 ; RV32-NEXT: .LBB50_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: beqz a1, .LBB50_4 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV32-NEXT: slti a0, a1, 0 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: bnez a0, .LBB50_1 +; RV32-NEXT: slti a3, a1, 0 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bnez a3, .LBB50_1 ; RV32-NEXT: j .LBB50_5 ; RV32-NEXT: .LBB50_4: # in Loop: Header=BB50_2 Depth=1 -; RV32-NEXT: sltiu a0, a4, 2 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: bnez a0, .LBB50_1 +; RV32-NEXT: sltiu a3, a0, 2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bnez a3, .LBB50_1 ; RV32-NEXT: .LBB50_5: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB50_2 Depth=1 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: j .LBB50_1 ; RV32-NEXT: .LBB50_6: # %atomicrmw.end -; RV32-NEXT: mv a0, a4 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 @@ -3501,29 +3501,29 @@ define i64 @rmw64_min_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: ld a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: ld a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: li s1, 2 ; RV64-NO-ATOMIC-NEXT: j .LBB50_2 ; RV64-NO-ATOMIC-NEXT: .LBB50_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB50_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sd a1, 0(sp) +; RV64-NO-ATOMIC-NEXT: sd a0, 0(sp) ; RV64-NO-ATOMIC-NEXT: mv a1, sp ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_8 -; RV64-NO-ATOMIC-NEXT: ld a1, 0(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB50_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: ld a0, 0(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB50_4 ; RV64-NO-ATOMIC-NEXT: .LBB50_2: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: blt 
a1, s1, .LBB50_1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: blt a0, s1, .LBB50_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB50_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB50_1 ; RV64-NO-ATOMIC-NEXT: .LBB50_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3560,37 +3560,37 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s0) ; RV32-NEXT: j .LBB51_2 ; RV32-NEXT: .LBB51_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1 -; RV32-NEXT: neg a3, a0 +; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a3, a3, a1 -; RV32-NEXT: sw a4, 0(sp) +; RV32-NEXT: sw a0, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: mv a1, sp ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: lw a0, 0(sp) ; RV32-NEXT: lw a1, 4(sp) -; RV32-NEXT: bnez a0, .LBB51_4 +; RV32-NEXT: bnez a2, .LBB51_4 ; RV32-NEXT: .LBB51_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: snez a0, a1 -; RV32-NEXT: sltiu a2, a4, 2 -; RV32-NEXT: xori a2, a2, 1 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: bnez a0, .LBB51_1 +; RV32-NEXT: snez a2, a1 +; RV32-NEXT: sltiu a3, a0, 2 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bnez a3, .LBB51_1 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: j .LBB51_1 ; RV32-NEXT: .LBB51_4: # %atomicrmw.end -; RV32-NEXT: mv a0, a4 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 @@ -3602,21 +3602,21 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: ld a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: ld a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: .LBB51_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: seqz a2, a1 -; RV64-NO-ATOMIC-NEXT: add a2, a1, a2 -; RV64-NO-ATOMIC-NEXT: sd a1, 8(sp) +; RV64-NO-ATOMIC-NEXT: seqz a2, a0 +; RV64-NO-ATOMIC-NEXT: add a2, a0, a2 +; RV64-NO-ATOMIC-NEXT: sd a0, 8(sp) ; RV64-NO-ATOMIC-NEXT: addi a1, sp, 8 ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_8 -; RV64-NO-ATOMIC-NEXT: ld a1, 8(sp) -; RV64-NO-ATOMIC-NEXT: beqz a0, .LBB51_1 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: ld a0, 8(sp) +; RV64-NO-ATOMIC-NEXT: beqz a1, .LBB51_1 ; RV64-NO-ATOMIC-NEXT: # %bb.2: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: addi sp, sp, 32 @@ -3652,36 +3652,36 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind { ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; 
RV32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lw a1, 4(s0) ; RV32-NEXT: j .LBB52_2 ; RV32-NEXT: .LBB52_1: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV32-NEXT: neg a3, a0 +; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a3, a3, a1 -; RV32-NEXT: sw a4, 0(sp) +; RV32-NEXT: sw a0, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: mv a1, sp ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 ; RV32-NEXT: mv a0, s0 ; RV32-NEXT: call __atomic_compare_exchange_8 -; RV32-NEXT: lw a4, 0(sp) +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: lw a0, 0(sp) ; RV32-NEXT: lw a1, 4(sp) -; RV32-NEXT: bnez a0, .LBB52_4 +; RV32-NEXT: bnez a2, .LBB52_4 ; RV32-NEXT: .LBB52_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: sltiu a0, a4, 2 -; RV32-NEXT: seqz a2, a1 -; RV32-NEXT: and a0, a2, a0 -; RV32-NEXT: mv a2, a4 -; RV32-NEXT: bnez a0, .LBB52_1 +; RV32-NEXT: sltiu a2, a0, 2 +; RV32-NEXT: seqz a3, a1 +; RV32-NEXT: and a3, a3, a2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bnez a3, .LBB52_1 ; RV32-NEXT: # %bb.3: # %atomicrmw.start ; RV32-NEXT: # in Loop: Header=BB52_2 Depth=1 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: j .LBB52_1 ; RV32-NEXT: .LBB52_4: # %atomicrmw.end -; RV32-NEXT: mv a0, a4 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 @@ -3694,29 +3694,29 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind { ; RV64-NO-ATOMIC-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64-NO-ATOMIC-NEXT: mv s0, a0 -; RV64-NO-ATOMIC-NEXT: ld a1, 0(a0) +; RV64-NO-ATOMIC-NEXT: ld a0, 0(a0) ; RV64-NO-ATOMIC-NEXT: li s1, 2 ; RV64-NO-ATOMIC-NEXT: j .LBB52_2 ; RV64-NO-ATOMIC-NEXT: .LBB52_1: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB52_2 Depth=1 -; RV64-NO-ATOMIC-NEXT: sd a1, 0(sp) +; RV64-NO-ATOMIC-NEXT: sd a0, 0(sp) ; RV64-NO-ATOMIC-NEXT: mv a1, sp ; RV64-NO-ATOMIC-NEXT: li a3, 5 ; RV64-NO-ATOMIC-NEXT: li a4, 5 ; RV64-NO-ATOMIC-NEXT: mv a0, s0 ; RV64-NO-ATOMIC-NEXT: call __atomic_compare_exchange_8 -; RV64-NO-ATOMIC-NEXT: ld a1, 0(sp) -; RV64-NO-ATOMIC-NEXT: bnez a0, .LBB52_4 +; RV64-NO-ATOMIC-NEXT: mv a1, a0 +; RV64-NO-ATOMIC-NEXT: ld a0, 0(sp) +; RV64-NO-ATOMIC-NEXT: bnez a1, .LBB52_4 ; RV64-NO-ATOMIC-NEXT: .LBB52_2: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64-NO-ATOMIC-NEXT: mv a2, a1 -; RV64-NO-ATOMIC-NEXT: bltu a1, s1, .LBB52_1 +; RV64-NO-ATOMIC-NEXT: mv a2, a0 +; RV64-NO-ATOMIC-NEXT: bltu a0, s1, .LBB52_1 ; RV64-NO-ATOMIC-NEXT: # %bb.3: # %atomicrmw.start ; RV64-NO-ATOMIC-NEXT: # in Loop: Header=BB52_2 Depth=1 ; RV64-NO-ATOMIC-NEXT: li a2, 1 ; RV64-NO-ATOMIC-NEXT: j .LBB52_1 ; RV64-NO-ATOMIC-NEXT: .LBB52_4: # %atomicrmw.end -; RV64-NO-ATOMIC-NEXT: mv a0, a1 ; RV64-NO-ATOMIC-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64-NO-ATOMIC-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -4530,12 +4530,12 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: mv s0, a1 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s0, a0 ; RV32-NEXT: lw a4, 0(a1) ; RV32-NEXT: lw a3, 4(a1) ; RV32-NEXT: lw a1, 8(a1) -; RV32-NEXT: lw a2, 12(s0) -; RV32-NEXT: mv s1, a0 +; RV32-NEXT: lw a2, 12(s1) ; RV32-NEXT: .LBB62_1: # 
%atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: addi a0, a4, 1 @@ -4559,7 +4559,7 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: mv a3, sp ; RV32-NEXT: li a4, 5 ; RV32-NEXT: li a5, 5 -; RV32-NEXT: mv a1, s0 +; RV32-NEXT: mv a1, s1 ; RV32-NEXT: call __atomic_compare_exchange ; RV32-NEXT: lw a4, 16(sp) ; RV32-NEXT: lw a3, 20(sp) @@ -4567,10 +4567,10 @@ define i128 @rmw128(ptr %p) nounwind { ; RV32-NEXT: lw a2, 28(sp) ; RV32-NEXT: beqz a0, .LBB62_1 ; RV32-NEXT: # %bb.2: # %atomicrmw.end -; RV32-NEXT: sw a4, 0(s1) -; RV32-NEXT: sw a3, 4(s1) -; RV32-NEXT: sw a1, 8(s1) -; RV32-NEXT: sw a2, 12(s1) +; RV32-NEXT: sw a4, 0(s0) +; RV32-NEXT: sw a3, 4(s0) +; RV32-NEXT: sw a1, 8(s0) +; RV32-NEXT: sw a2, 12(s0) ; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll b/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll index e9b771a0698de..7da9bbbb079e9 100644 --- a/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll +++ b/llvm/test/CodeGen/RISCV/fp-fcanonicalize.ll @@ -306,12 +306,12 @@ define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs0, -48 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs1, -56 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs2, -64 +; CHECK-NOFP16-RV64-NEXT: mv s0, a0 ; CHECK-NOFP16-RV64-NEXT: lhu s1, 0(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s2, 8(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s3, 16(a1) -; CHECK-NOFP16-RV64-NEXT: lhu a1, 24(a1) -; CHECK-NOFP16-RV64-NEXT: mv s0, a0 -; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a1 +; CHECK-NOFP16-RV64-NEXT: lhu a0, 24(a1) +; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a0 ; CHECK-NOFP16-RV64-NEXT: call __extendhfsf2 ; CHECK-NOFP16-RV64-NEXT: fmv.s fs0, fa0 ; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, s3 @@ -330,8 +330,8 @@ define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s1, fa0 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s2, fs2 -; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s3, fs1 +; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV64-NEXT: sh s1, 0(s0) @@ -419,8 +419,8 @@ define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) { ; CHECK-NOFP16-RV32-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s1, fa0 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s2, fs1 -; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs3, fs3 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s3, fs2 +; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs3, fs3 ; CHECK-NOFP16-RV32-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV32-NEXT: sh s1, 0(s0) @@ -485,12 +485,12 @@ define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs0, -48 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs1, -56 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs2, -64 +; CHECK-NOFP16-RV64-NEXT: mv s0, a0 ; CHECK-NOFP16-RV64-NEXT: lhu s1, 0(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s2, 8(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s3, 16(a1) -; CHECK-NOFP16-RV64-NEXT: lhu a1, 24(a1) -; CHECK-NOFP16-RV64-NEXT: mv s0, a0 -; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a1 +; CHECK-NOFP16-RV64-NEXT: lhu a0, 24(a1) +; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a0 ; CHECK-NOFP16-RV64-NEXT: call __extendhfsf2 ; CHECK-NOFP16-RV64-NEXT: fmv.s fs0, fa0 ; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, s3 @@ -509,8 +509,8 @@ define <4 x half> 
@fcanonicalize_v4f16_nnan(<4 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s1, fa0 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s2, fs2 -; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s3, fs1 +; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV64-NEXT: sh s1, 0(s0) @@ -598,8 +598,8 @@ define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) { ; CHECK-NOFP16-RV32-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s1, fa0 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s2, fs1 -; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs3, fs3 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s3, fs2 +; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs3, fs3 ; CHECK-NOFP16-RV32-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV32-NEXT: sh s1, 0(s0) @@ -688,6 +688,7 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs4, -112 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs5, -120 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs6, -128 +; CHECK-NOFP16-RV64-NEXT: mv s0, a0 ; CHECK-NOFP16-RV64-NEXT: lhu s1, 0(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s2, 8(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s3, 16(a1) @@ -695,9 +696,8 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: lhu s5, 32(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s6, 40(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s7, 48(a1) -; CHECK-NOFP16-RV64-NEXT: lhu a1, 56(a1) -; CHECK-NOFP16-RV64-NEXT: mv s0, a0 -; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a1 +; CHECK-NOFP16-RV64-NEXT: lhu a0, 56(a1) +; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a0 ; CHECK-NOFP16-RV64-NEXT: call __extendhfsf2 ; CHECK-NOFP16-RV64-NEXT: fmv.s fs0, fa0 ; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, s7 @@ -740,8 +740,8 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s4, fs4 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s5, fs3 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s6, fs2 -; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s7, fs1 +; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV64-NEXT: sh s5, 8(s0) @@ -905,8 +905,8 @@ define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) { ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s4, fs5 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s5, fs6 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s6, fs7 -; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs1, fs1 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s7, fs3 +; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs1, fs1 ; CHECK-NOFP16-RV32-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV32-NEXT: sh s5, 8(s0) @@ -1015,6 +1015,7 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs4, -112 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs5, -120 ; CHECK-NOFP16-RV64-NEXT: .cfi_offset fs6, -128 +; CHECK-NOFP16-RV64-NEXT: mv s0, a0 ; CHECK-NOFP16-RV64-NEXT: lhu s1, 0(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s2, 8(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s3, 16(a1) @@ -1022,9 +1023,8 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: lhu s5, 32(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s6, 40(a1) ; CHECK-NOFP16-RV64-NEXT: lhu s7, 48(a1) -; CHECK-NOFP16-RV64-NEXT: lhu a1, 56(a1) -; CHECK-NOFP16-RV64-NEXT: mv s0, a0 -; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a1 +; CHECK-NOFP16-RV64-NEXT: lhu a0, 56(a1) +; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, a0 ; 
CHECK-NOFP16-RV64-NEXT: call __extendhfsf2 ; CHECK-NOFP16-RV64-NEXT: fmv.s fs0, fa0 ; CHECK-NOFP16-RV64-NEXT: fmv.w.x fa0, s7 @@ -1067,8 +1067,8 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) { ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s4, fs4 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s5, fs3 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s6, fs2 -; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w s7, fs1 +; CHECK-NOFP16-RV64-NEXT: fmin.s fa0, fs0, fs0 ; CHECK-NOFP16-RV64-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV64-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV64-NEXT: sh s5, 8(s0) @@ -1232,8 +1232,8 @@ define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) { ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s4, fs5 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s5, fs6 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s6, fs7 -; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs1, fs1 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w s7, fs3 +; CHECK-NOFP16-RV32-NEXT: fmin.s fa0, fs1, fs1 ; CHECK-NOFP16-RV32-NEXT: call __truncsfhf2 ; CHECK-NOFP16-RV32-NEXT: fmv.x.w a0, fa0 ; CHECK-NOFP16-RV32-NEXT: sh s5, 8(s0) diff --git a/llvm/test/CodeGen/RISCV/fp128.ll b/llvm/test/CodeGen/RISCV/fp128.ll index a8e26f7686e50..443bd22c58a21 100644 --- a/llvm/test/CodeGen/RISCV/fp128.ll +++ b/llvm/test/CodeGen/RISCV/fp128.ll @@ -18,21 +18,21 @@ define i32 @test_load_and_cmp() nounwind { ; RV32I-NEXT: lw a2, %lo(x)(a0) ; RV32I-NEXT: lw a3, %lo(x+4)(a0) ; RV32I-NEXT: lw a4, %lo(x+8)(a0) -; RV32I-NEXT: lw a5, %lo(x+12)(a0) -; RV32I-NEXT: lw a0, %lo(y)(a1) +; RV32I-NEXT: lw a0, %lo(x+12)(a0) +; RV32I-NEXT: lw a5, %lo(y)(a1) ; RV32I-NEXT: lw a6, %lo(y+4)(a1) ; RV32I-NEXT: lw a7, %lo(y+8)(a1) ; RV32I-NEXT: lw a1, %lo(y+12)(a1) -; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: sw a7, 16(sp) ; RV32I-NEXT: sw a1, 20(sp) -; RV32I-NEXT: addi a0, sp, 24 -; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: sw a2, 24(sp) ; RV32I-NEXT: sw a3, 28(sp) ; RV32I-NEXT: sw a4, 32(sp) -; RV32I-NEXT: sw a5, 36(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: addi a0, sp, 24 +; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: call __netf2 ; RV32I-NEXT: snez a0, a0 ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload @@ -52,35 +52,35 @@ define i32 @test_add_and_fptosi() nounwind { ; RV32I-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a0, %hi(x) ; RV32I-NEXT: lui a1, %hi(y) -; RV32I-NEXT: lw a3, %lo(x)(a0) -; RV32I-NEXT: lw a4, %lo(x+4)(a0) -; RV32I-NEXT: lw a5, %lo(x+8)(a0) -; RV32I-NEXT: lw a6, %lo(x+12)(a0) -; RV32I-NEXT: lw a0, %lo(y)(a1) -; RV32I-NEXT: lw a2, %lo(y+4)(a1) +; RV32I-NEXT: lw a2, %lo(x)(a0) +; RV32I-NEXT: lw a3, %lo(x+4)(a0) +; RV32I-NEXT: lw a4, %lo(x+8)(a0) +; RV32I-NEXT: lw a0, %lo(x+12)(a0) +; RV32I-NEXT: lw a5, %lo(y)(a1) +; RV32I-NEXT: lw a6, %lo(y+4)(a1) ; RV32I-NEXT: lw a7, %lo(y+8)(a1) ; RV32I-NEXT: lw a1, %lo(y+12)(a1) -; RV32I-NEXT: sw a0, 24(sp) -; RV32I-NEXT: sw a2, 28(sp) +; RV32I-NEXT: sw a5, 24(sp) +; RV32I-NEXT: sw a6, 28(sp) ; RV32I-NEXT: sw a7, 32(sp) ; RV32I-NEXT: sw a1, 36(sp) +; RV32I-NEXT: sw a2, 40(sp) +; RV32I-NEXT: sw a3, 44(sp) +; RV32I-NEXT: sw a4, 48(sp) +; RV32I-NEXT: sw a0, 52(sp) ; RV32I-NEXT: addi a0, sp, 56 ; RV32I-NEXT: addi a1, sp, 40 ; RV32I-NEXT: addi a2, sp, 24 -; RV32I-NEXT: sw a3, 40(sp) -; RV32I-NEXT: sw a4, 44(sp) -; RV32I-NEXT: sw a5, 48(sp) -; RV32I-NEXT: sw a6, 52(sp) ; RV32I-NEXT: call __addtf3 -; RV32I-NEXT: lw a1, 56(sp) -; RV32I-NEXT: lw a2, 60(sp) -; RV32I-NEXT: lw a3, 64(sp) -; RV32I-NEXT: lw a4, 68(sp) +; RV32I-NEXT: lw a0, 56(sp) +; RV32I-NEXT: lw a1, 60(sp) +; RV32I-NEXT: lw 
a2, 64(sp) +; RV32I-NEXT: lw a3, 68(sp) +; RV32I-NEXT: sw a0, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) +; RV32I-NEXT: sw a2, 16(sp) +; RV32I-NEXT: sw a3, 20(sp) ; RV32I-NEXT: addi a0, sp, 8 -; RV32I-NEXT: sw a1, 8(sp) -; RV32I-NEXT: sw a2, 12(sp) -; RV32I-NEXT: sw a3, 16(sp) -; RV32I-NEXT: sw a4, 20(sp) ; RV32I-NEXT: call __fixtfsi ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 80 @@ -101,26 +101,26 @@ define fp128 @fmaximum(fp128 %x, fp128 %y) { ; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a5, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a1, 0(a2) -; RV32I-NEXT: lw a7, 4(a2) -; RV32I-NEXT: lw t0, 8(a2) -; RV32I-NEXT: lw a2, 12(a2) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: sw a1, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: sw t0, 16(sp) +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a7, 16(sp) ; RV32I-NEXT: sw a2, 20(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: sw a4, 32(sp) +; RV32I-NEXT: sw a1, 36(sp) ; RV32I-NEXT: addi a0, sp, 40 ; RV32I-NEXT: addi a1, sp, 24 ; RV32I-NEXT: addi a2, sp, 8 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw a5, 32(sp) -; RV32I-NEXT: sw a6, 36(sp) ; RV32I-NEXT: call fmaximuml ; RV32I-NEXT: lw a0, 40(sp) ; RV32I-NEXT: lw a1, 44(sp) @@ -150,26 +150,26 @@ define fp128 @fminimum(fp128 %x, fp128 %y) { ; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a5, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) -; RV32I-NEXT: lw a1, 0(a2) -; RV32I-NEXT: lw a7, 4(a2) -; RV32I-NEXT: lw t0, 8(a2) -; RV32I-NEXT: lw a2, 12(a2) ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: sw a1, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: sw t0, 16(sp) +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: lw a5, 0(a2) +; RV32I-NEXT: lw a6, 4(a2) +; RV32I-NEXT: lw a7, 8(a2) +; RV32I-NEXT: lw a2, 12(a2) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a7, 16(sp) ; RV32I-NEXT: sw a2, 20(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: sw a4, 32(sp) +; RV32I-NEXT: sw a1, 36(sp) ; RV32I-NEXT: addi a0, sp, 40 ; RV32I-NEXT: addi a1, sp, 24 ; RV32I-NEXT: addi a2, sp, 8 -; RV32I-NEXT: sw a3, 24(sp) -; RV32I-NEXT: sw a4, 28(sp) -; RV32I-NEXT: sw a5, 32(sp) -; RV32I-NEXT: sw a6, 36(sp) ; RV32I-NEXT: call fminimuml ; RV32I-NEXT: lw a0, 40(sp) ; RV32I-NEXT: lw a1, 44(sp) diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index c5c3b199447a9..2c1503cc162ea 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -192,8 +192,8 @@ define i32 @ustest_f64i32(double %x) { ; RV32IF-NEXT: .LBB2_3: # %entry ; RV32IF-NEXT: addi a3, a2, -1 ; RV32IF-NEXT: neg a2, a2 -; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: or a0, a3, a0 +; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: beqz a1, .LBB2_5 ; RV32IF-NEXT: # %bb.4: # %entry ; RV32IF-NEXT: sgtz a1, a1 @@ -501,8 +501,8 @@ define i32 @ustest_f16i32(half %x) 
{ ; RV32-NEXT: .LBB8_3: # %entry ; RV32-NEXT: addi a3, a2, -1 ; RV32-NEXT: neg a2, a2 -; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: beqz a1, .LBB8_5 ; RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: sgtz a1, a1 @@ -1277,20 +1277,20 @@ define i64 @utest_f64i64(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixunsdfti -; RV32IF-NEXT: lw a0, 16(sp) -; RV32IF-NEXT: lw a1, 20(sp) -; RV32IF-NEXT: lw a2, 12(sp) -; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: or a4, a1, a0 -; RV32IF-NEXT: xori a0, a0, 1 +; RV32IF-NEXT: lw a0, 8(sp) +; RV32IF-NEXT: lw a1, 12(sp) +; RV32IF-NEXT: lw a2, 16(sp) +; RV32IF-NEXT: lw a3, 20(sp) +; RV32IF-NEXT: or a4, a3, a2 +; RV32IF-NEXT: xori a2, a2, 1 ; RV32IF-NEXT: seqz a4, a4 -; RV32IF-NEXT: or a0, a0, a1 -; RV32IF-NEXT: seqz a0, a0 -; RV32IF-NEXT: addi a0, a0, -1 -; RV32IF-NEXT: and a0, a0, a4 -; RV32IF-NEXT: neg a1, a0 -; RV32IF-NEXT: and a0, a1, a3 -; RV32IF-NEXT: and a1, a1, a2 +; RV32IF-NEXT: or a2, a2, a3 +; RV32IF-NEXT: seqz a2, a2 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: and a2, a2, a4 +; RV32IF-NEXT: neg a2, a2 +; RV32IF-NEXT: and a0, a2, a0 +; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -1321,20 +1321,20 @@ define i64 @utest_f64i64(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixunsdfti -; RV32IFD-NEXT: lw a0, 16(sp) -; RV32IFD-NEXT: lw a1, 20(sp) -; RV32IFD-NEXT: lw a2, 12(sp) -; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: or a4, a1, a0 -; RV32IFD-NEXT: xori a0, a0, 1 +; RV32IFD-NEXT: lw a0, 8(sp) +; RV32IFD-NEXT: lw a1, 12(sp) +; RV32IFD-NEXT: lw a2, 16(sp) +; RV32IFD-NEXT: lw a3, 20(sp) +; RV32IFD-NEXT: or a4, a3, a2 +; RV32IFD-NEXT: xori a2, a2, 1 ; RV32IFD-NEXT: seqz a4, a4 -; RV32IFD-NEXT: or a0, a0, a1 -; RV32IFD-NEXT: seqz a0, a0 -; RV32IFD-NEXT: addi a0, a0, -1 -; RV32IFD-NEXT: and a0, a0, a4 -; RV32IFD-NEXT: neg a1, a0 -; RV32IFD-NEXT: and a0, a1, a3 -; RV32IFD-NEXT: and a1, a1, a2 +; RV32IFD-NEXT: or a2, a2, a3 +; RV32IFD-NEXT: seqz a2, a2 +; RV32IFD-NEXT: addi a2, a2, -1 +; RV32IFD-NEXT: and a2, a2, a4 +; RV32IFD-NEXT: neg a2, a2 +; RV32IFD-NEXT: and a0, a2, a0 +; RV32IFD-NEXT: and a1, a2, a1 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -1359,8 +1359,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti -; RV32IF-NEXT: lw a1, 20(sp) ; RV32IF-NEXT: lw a0, 16(sp) +; RV32IF-NEXT: lw a1, 20(sp) ; RV32IF-NEXT: beqz a1, .LBB20_2 ; RV32IF-NEXT: # %bb.1: # %entry ; RV32IF-NEXT: slti a2, a1, 0 @@ -1378,8 +1378,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: # %bb.4: # %entry ; RV32IF-NEXT: li a0, 1 ; RV32IF-NEXT: .LBB20_5: # %entry -; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: lw a4, 12(sp) +; RV32IF-NEXT: lw a4, 8(sp) +; RV32IF-NEXT: lw a3, 12(sp) ; RV32IF-NEXT: and a5, a2, a1 ; RV32IF-NEXT: beqz a5, .LBB20_7 ; RV32IF-NEXT: # %bb.6: # %entry @@ -1388,17 +1388,17 @@ define i64 @ustest_f64i64(double %x) { ; RV32IF-NEXT: .LBB20_7: ; RV32IF-NEXT: snez a1, a0 ; RV32IF-NEXT: .LBB20_8: # %entry -; RV32IF-NEXT: and a4, a2, a4 +; RV32IF-NEXT: and a3, a2, a3 +; RV32IF-NEXT: and a2, a2, a4 ; RV32IF-NEXT: or a0, a0, a5 -; RV32IF-NEXT: and a2, a2, a3 ; RV32IF-NEXT: bnez a0, .LBB20_10 ; RV32IF-NEXT: # %bb.9: -; RV32IF-NEXT: or a0, a2, a4 +; RV32IF-NEXT: or a0, a2, 
a3 ; RV32IF-NEXT: snez a1, a0 ; RV32IF-NEXT: .LBB20_10: # %entry ; RV32IF-NEXT: neg a1, a1 ; RV32IF-NEXT: and a0, a1, a2 -; RV32IF-NEXT: and a1, a1, a4 +; RV32IF-NEXT: and a1, a1, a3 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -1442,8 +1442,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti -; RV32IFD-NEXT: lw a1, 20(sp) ; RV32IFD-NEXT: lw a0, 16(sp) +; RV32IFD-NEXT: lw a1, 20(sp) ; RV32IFD-NEXT: beqz a1, .LBB20_2 ; RV32IFD-NEXT: # %bb.1: # %entry ; RV32IFD-NEXT: slti a2, a1, 0 @@ -1461,8 +1461,8 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: # %bb.4: # %entry ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: .LBB20_5: # %entry -; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: lw a4, 12(sp) +; RV32IFD-NEXT: lw a4, 8(sp) +; RV32IFD-NEXT: lw a3, 12(sp) ; RV32IFD-NEXT: and a5, a2, a1 ; RV32IFD-NEXT: beqz a5, .LBB20_7 ; RV32IFD-NEXT: # %bb.6: # %entry @@ -1471,17 +1471,17 @@ define i64 @ustest_f64i64(double %x) { ; RV32IFD-NEXT: .LBB20_7: ; RV32IFD-NEXT: snez a1, a0 ; RV32IFD-NEXT: .LBB20_8: # %entry -; RV32IFD-NEXT: and a4, a2, a4 +; RV32IFD-NEXT: and a3, a2, a3 +; RV32IFD-NEXT: and a2, a2, a4 ; RV32IFD-NEXT: or a0, a0, a5 -; RV32IFD-NEXT: and a2, a2, a3 ; RV32IFD-NEXT: bnez a0, .LBB20_10 ; RV32IFD-NEXT: # %bb.9: -; RV32IFD-NEXT: or a0, a2, a4 +; RV32IFD-NEXT: or a0, a2, a3 ; RV32IFD-NEXT: snez a1, a0 ; RV32IFD-NEXT: .LBB20_10: # %entry ; RV32IFD-NEXT: neg a1, a1 ; RV32IFD-NEXT: and a0, a1, a2 -; RV32IFD-NEXT: and a1, a1, a4 +; RV32IFD-NEXT: and a1, a1, a3 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -1587,20 +1587,20 @@ define i64 @utest_f32i64(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixunssfti -; RV32-NEXT: lw a0, 16(sp) -; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a3, 20(sp) +; RV32-NEXT: or a4, a3, a2 +; RV32-NEXT: xori a2, a2, 1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: seqz a2, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a4 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -1639,8 +1639,8 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: beqz a1, .LBB23_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: slti a2, a1, 0 @@ -1658,8 +1658,8 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB23_5: # %entry -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: lw a4, 12(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a3, 12(sp) ; RV32-NEXT: and a5, a2, a1 ; RV32-NEXT: beqz a5, .LBB23_7 ; RV32-NEXT: # %bb.6: # %entry @@ -1668,17 +1668,17 @@ define i64 @ustest_f32i64(float %x) { ; RV32-NEXT: .LBB23_7: ; RV32-NEXT: snez a1, a0 ; 
RV32-NEXT: .LBB23_8: # %entry -; RV32-NEXT: and a4, a2, a4 +; RV32-NEXT: and a3, a2, a3 +; RV32-NEXT: and a2, a2, a4 ; RV32-NEXT: or a0, a0, a5 -; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB23_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a2, a4 +; RV32-NEXT: or a0, a2, a3 ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB23_10: # %entry ; RV32-NEXT: neg a1, a1 ; RV32-NEXT: and a0, a1, a2 -; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -1848,20 +1848,20 @@ define i64 @utesth_f16i64(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixunssfti -; RV32-NEXT: lw a0, 16(sp) -; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a3, 20(sp) +; RV32-NEXT: or a4, a3, a2 +; RV32-NEXT: xori a2, a2, 1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: seqz a2, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a4 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -1902,8 +1902,8 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a0, 16(sp) +; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: beqz a1, .LBB26_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: slti a2, a1, 0 @@ -1921,8 +1921,8 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: # %bb.4: # %entry ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB26_5: # %entry -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: lw a4, 12(sp) +; RV32-NEXT: lw a4, 8(sp) +; RV32-NEXT: lw a3, 12(sp) ; RV32-NEXT: and a5, a2, a1 ; RV32-NEXT: beqz a5, .LBB26_7 ; RV32-NEXT: # %bb.6: # %entry @@ -1931,17 +1931,17 @@ define i64 @ustest_f16i64(half %x) { ; RV32-NEXT: .LBB26_7: ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB26_8: # %entry -; RV32-NEXT: and a4, a2, a4 +; RV32-NEXT: and a3, a2, a3 +; RV32-NEXT: and a2, a2, a4 ; RV32-NEXT: or a0, a0, a5 -; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: bnez a0, .LBB26_10 ; RV32-NEXT: # %bb.9: -; RV32-NEXT: or a0, a2, a4 +; RV32-NEXT: or a0, a2, a3 ; RV32-NEXT: snez a1, a0 ; RV32-NEXT: .LBB26_10: # %entry ; RV32-NEXT: neg a1, a1 ; RV32-NEXT: and a0, a1, a2 -; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: and a1, a1, a3 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3211,20 +3211,20 @@ define i64 @utest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixunsdfti -; RV32IF-NEXT: lw a0, 16(sp) -; RV32IF-NEXT: lw a1, 20(sp) -; RV32IF-NEXT: lw a2, 12(sp) -; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: or a4, a1, a0 -; RV32IF-NEXT: xori a0, a0, 1 +; RV32IF-NEXT: lw a0, 8(sp) +; RV32IF-NEXT: lw a1, 12(sp) +; RV32IF-NEXT: lw a2, 16(sp) +; RV32IF-NEXT: lw a3, 20(sp) +; RV32IF-NEXT: or a4, a3, a2 +; RV32IF-NEXT: xori a2, a2, 1 ; RV32IF-NEXT: seqz a4, a4 -; RV32IF-NEXT: or a0, a0, a1 -; RV32IF-NEXT: seqz a0, a0 -; RV32IF-NEXT: addi a0, a0, -1 -; RV32IF-NEXT: 
and a0, a0, a4 -; RV32IF-NEXT: neg a1, a0 -; RV32IF-NEXT: and a0, a1, a3 -; RV32IF-NEXT: and a1, a1, a2 +; RV32IF-NEXT: or a2, a2, a3 +; RV32IF-NEXT: seqz a2, a2 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: and a2, a2, a4 +; RV32IF-NEXT: neg a2, a2 +; RV32IF-NEXT: and a0, a2, a0 +; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -3255,20 +3255,20 @@ define i64 @utest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixunsdfti -; RV32IFD-NEXT: lw a0, 16(sp) -; RV32IFD-NEXT: lw a1, 20(sp) -; RV32IFD-NEXT: lw a2, 12(sp) -; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: or a4, a1, a0 -; RV32IFD-NEXT: xori a0, a0, 1 +; RV32IFD-NEXT: lw a0, 8(sp) +; RV32IFD-NEXT: lw a1, 12(sp) +; RV32IFD-NEXT: lw a2, 16(sp) +; RV32IFD-NEXT: lw a3, 20(sp) +; RV32IFD-NEXT: or a4, a3, a2 +; RV32IFD-NEXT: xori a2, a2, 1 ; RV32IFD-NEXT: seqz a4, a4 -; RV32IFD-NEXT: or a0, a0, a1 -; RV32IFD-NEXT: seqz a0, a0 -; RV32IFD-NEXT: addi a0, a0, -1 -; RV32IFD-NEXT: and a0, a0, a4 -; RV32IFD-NEXT: neg a1, a0 -; RV32IFD-NEXT: and a0, a1, a3 -; RV32IFD-NEXT: and a1, a1, a2 +; RV32IFD-NEXT: or a2, a2, a3 +; RV32IFD-NEXT: seqz a2, a2 +; RV32IFD-NEXT: addi a2, a2, -1 +; RV32IFD-NEXT: and a2, a2, a4 +; RV32IFD-NEXT: neg a2, a2 +; RV32IFD-NEXT: and a0, a2, a0 +; RV32IFD-NEXT: and a1, a2, a1 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti -; RV32IF-NEXT: lw a0, 20(sp) -; RV32IF-NEXT: lw a1, 8(sp) +; RV32IF-NEXT: lw a0, 8(sp) ; RV32IF-NEXT: lw a2, 12(sp) ; RV32IF-NEXT: lw a3, 16(sp) -; RV32IF-NEXT: beqz a0, .LBB47_2 +; RV32IF-NEXT: lw a1, 20(sp) +; RV32IF-NEXT: beqz a1, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a4, a0, 0 +; RV32IF-NEXT: slti a4, a1, 0 ; RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: ; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry ; RV32IF-NEXT: xori a3, a3, 1 -; RV32IF-NEXT: or a3, a3, a0 +; RV32IF-NEXT: or a3, a3, a1 ; RV32IF-NEXT: seqz a3, a3 ; RV32IF-NEXT: addi a3, a3, -1 ; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a2, a3, a2 -; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: and a0, a3, a0 -; RV32IF-NEXT: slti a0, a0, 0 -; RV32IF-NEXT: addi a3, a0, -1 -; RV32IF-NEXT: and a0, a3, a1 -; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: and a1, a3, a1 +; RV32IF-NEXT: slti a1, a1, 0 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: and a0, a1, a0 +; RV32IF-NEXT: and a1, a1, a2 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti -; RV32IFD-NEXT: lw a0, 20(sp) -; RV32IFD-NEXT: lw a1, 8(sp) +; RV32IFD-NEXT: lw a0, 8(sp) ; RV32IFD-NEXT: lw a2, 12(sp) ; RV32IFD-NEXT: lw a3, 16(sp) -; RV32IFD-NEXT: beqz a0, .LBB47_2 +; RV32IFD-NEXT: lw a1, 20(sp) +; RV32IFD-NEXT: beqz a1, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a4, a0, 0 +; RV32IFD-NEXT: slti a4, a1, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: ; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry ; RV32IFD-NEXT: xori a3, a3, 1 -; RV32IFD-NEXT: or a3, a3, a0 +; RV32IFD-NEXT: or a3, a3, 
a1 ; RV32IFD-NEXT: seqz a3, a3 ; RV32IFD-NEXT: addi a3, a3, -1 ; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 ; RV32IFD-NEXT: and a2, a3, a2 -; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: slti a0, a0, 0 -; RV32IFD-NEXT: addi a3, a0, -1 -; RV32IFD-NEXT: and a0, a3, a1 -; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: and a1, a3, a1 +; RV32IFD-NEXT: slti a1, a1, 0 +; RV32IFD-NEXT: addi a1, a1, -1 +; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: and a1, a1, a2 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -3479,20 +3479,20 @@ define i64 @utest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixunssfti -; RV32-NEXT: lw a0, 16(sp) -; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a3, 20(sp) +; RV32-NEXT: or a4, a3, a2 +; RV32-NEXT: xori a2, a2, 1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: seqz a2, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a4 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a0, 8(sp) ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a0, .LBB50_2 +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: beqz a1, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a0, 0 +; RV32-NEXT: slti a4, a1, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a0 +; RV32-NEXT: or a3, a3, a1 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: slti a0, a0, 0 -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: and a0, a3, a1 -; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3714,20 +3714,20 @@ define i64 @utesth_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixunssfti -; RV32-NEXT: lw a0, 16(sp) -; RV32-NEXT: lw a1, 20(sp) -; RV32-NEXT: lw a2, 12(sp) -; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 16(sp) +; RV32-NEXT: lw a3, 20(sp) +; RV32-NEXT: or a4, a3, a2 +; RV32-NEXT: xori a2, a2, 1 ; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: seqz a0, a0 -; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: and a0, a0, a4 -; RV32-NEXT: neg a1, a0 -; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: 
and a1, a1, a2 +; RV32-NEXT: or a2, a2, a3 +; RV32-NEXT: seqz a2, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a4 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a1, 8(sp) +; RV32-NEXT: lw a0, 8(sp) ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a0, .LBB53_2 +; RV32-NEXT: lw a1, 20(sp) +; RV32-NEXT: beqz a1, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a0, 0 +; RV32-NEXT: slti a4, a1, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a0 +; RV32-NEXT: or a3, a3, a1 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a2, a3, a2 -; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: slti a0, a0, 0 -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: and a0, a3, a1 -; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll b/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll index 9322abcfbbdce..9ca527573e0c6 100644 --- a/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll +++ b/llvm/test/CodeGen/RISCV/get-setcc-result-type.ll @@ -5,22 +5,22 @@ define void @getSetCCResultType(ptr %p, ptr %q) nounwind { ; RV32I-LABEL: getSetCCResultType: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lw a1, 12(a0) -; RV32I-NEXT: lw a2, 8(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: snez a2, a2 -; RV32I-NEXT: snez a3, a3 +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 4(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a4, 12(a0) ; RV32I-NEXT: snez a4, a4 -; RV32I-NEXT: addi a4, a4, -1 -; RV32I-NEXT: addi a3, a3, -1 -; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: snez a3, a3 +; RV32I-NEXT: snez a2, a2 +; RV32I-NEXT: snez a1, a1 ; RV32I-NEXT: addi a1, a1, -1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: addi a3, a3, -1 +; RV32I-NEXT: addi a4, a4, -1 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a3, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret entry: %0 = load <4 x i32>, ptr %p, align 16 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index a218e89948d4b..690bf6c284eb2 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -2885,14 +2885,14 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; ; RV32IZFHMIN-LABEL: fsgnjx_f16: ; RV32IZFHMIN: # %bb.0: -; RV32IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) -; RV32IZFHMIN-NEXT: lhu a0, %lo(.LCPI23_0)(a0) -; RV32IZFHMIN-NEXT: fmv.x.h a1, fa0 -; RV32IZFHMIN-NEXT: lui a2, 1048568 -; RV32IZFHMIN-NEXT: and a1, a1, a2 -; RV32IZFHMIN-NEXT: slli a0, a0, 17 -; RV32IZFHMIN-NEXT: srli a0, a0, 17 -; RV32IZFHMIN-NEXT: or a0, a0, a1 +; RV32IZFHMIN-NEXT: 
fmv.x.h a0, fa0 +; RV32IZFHMIN-NEXT: lui a1, 1048568 +; RV32IZFHMIN-NEXT: and a0, a0, a1 +; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV32IZFHMIN-NEXT: lhu a1, %lo(.LCPI23_0)(a1) +; RV32IZFHMIN-NEXT: slli a1, a1, 17 +; RV32IZFHMIN-NEXT: srli a1, a1, 17 +; RV32IZFHMIN-NEXT: or a0, a1, a0 ; RV32IZFHMIN-NEXT: fmv.h.x fa5, a0 ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 ; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 @@ -2902,14 +2902,14 @@ define half @fsgnjx_f16(half %x, half %y) nounwind { ; ; RV64IZFHMIN-LABEL: fsgnjx_f16: ; RV64IZFHMIN: # %bb.0: -; RV64IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) -; RV64IZFHMIN-NEXT: lhu a0, %lo(.LCPI23_0)(a0) -; RV64IZFHMIN-NEXT: fmv.x.h a1, fa0 -; RV64IZFHMIN-NEXT: lui a2, 1048568 -; RV64IZFHMIN-NEXT: and a1, a1, a2 -; RV64IZFHMIN-NEXT: slli a0, a0, 49 -; RV64IZFHMIN-NEXT: srli a0, a0, 49 -; RV64IZFHMIN-NEXT: or a0, a0, a1 +; RV64IZFHMIN-NEXT: fmv.x.h a0, fa0 +; RV64IZFHMIN-NEXT: lui a1, 1048568 +; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV64IZFHMIN-NEXT: lhu a1, %lo(.LCPI23_0)(a1) +; RV64IZFHMIN-NEXT: slli a1, a1, 49 +; RV64IZFHMIN-NEXT: srli a1, a1, 49 +; RV64IZFHMIN-NEXT: or a0, a1, a0 ; RV64IZFHMIN-NEXT: fmv.h.x fa5, a0 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll index 0a04d44893e75..5396fab3437c7 100644 --- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll @@ -2519,12 +2519,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; RV32IZFH-NEXT: lw a1, 0(a0) ; RV32IZFH-NEXT: lw a2, 4(a0) ; RV32IZFH-NEXT: lw a3, 8(a0) -; RV32IZFH-NEXT: lw a4, 12(a0) -; RV32IZFH-NEXT: addi a0, sp, 8 +; RV32IZFH-NEXT: lw a0, 12(a0) ; RV32IZFH-NEXT: sw a1, 8(sp) ; RV32IZFH-NEXT: sw a2, 12(sp) ; RV32IZFH-NEXT: sw a3, 16(sp) -; RV32IZFH-NEXT: sw a4, 20(sp) +; RV32IZFH-NEXT: sw a0, 20(sp) +; RV32IZFH-NEXT: addi a0, sp, 8 ; RV32IZFH-NEXT: call __trunctfhf2 ; RV32IZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: addi sp, sp, 32 @@ -2546,12 +2546,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; RV32IZHINX-NEXT: lw a1, 0(a0) ; RV32IZHINX-NEXT: lw a2, 4(a0) ; RV32IZHINX-NEXT: lw a3, 8(a0) -; RV32IZHINX-NEXT: lw a4, 12(a0) -; RV32IZHINX-NEXT: addi a0, sp, 8 +; RV32IZHINX-NEXT: lw a0, 12(a0) ; RV32IZHINX-NEXT: sw a1, 8(sp) ; RV32IZHINX-NEXT: sw a2, 12(sp) ; RV32IZHINX-NEXT: sw a3, 16(sp) -; RV32IZHINX-NEXT: sw a4, 20(sp) +; RV32IZHINX-NEXT: sw a0, 20(sp) +; RV32IZHINX-NEXT: addi a0, sp, 8 ; RV32IZHINX-NEXT: call __trunctfhf2 ; RV32IZHINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 32 @@ -2573,12 +2573,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; RV32IDZFH-NEXT: lw a1, 0(a0) ; RV32IDZFH-NEXT: lw a2, 4(a0) ; RV32IDZFH-NEXT: lw a3, 8(a0) -; RV32IDZFH-NEXT: lw a4, 12(a0) -; RV32IDZFH-NEXT: addi a0, sp, 8 +; RV32IDZFH-NEXT: lw a0, 12(a0) ; RV32IDZFH-NEXT: sw a1, 8(sp) ; RV32IDZFH-NEXT: sw a2, 12(sp) ; RV32IDZFH-NEXT: sw a3, 16(sp) -; RV32IDZFH-NEXT: sw a4, 20(sp) +; RV32IDZFH-NEXT: sw a0, 20(sp) +; RV32IDZFH-NEXT: addi a0, sp, 8 ; RV32IDZFH-NEXT: call __trunctfhf2 ; RV32IDZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: addi sp, sp, 32 @@ -2600,12 +2600,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; RV32IZDINXZHINX-NEXT: lw a1, 0(a0) ; RV32IZDINXZHINX-NEXT: lw a2, 4(a0) ; RV32IZDINXZHINX-NEXT: lw a3, 8(a0) -; RV32IZDINXZHINX-NEXT: lw a4, 12(a0) -; RV32IZDINXZHINX-NEXT: addi 
a0, sp, 8 +; RV32IZDINXZHINX-NEXT: lw a0, 12(a0) ; RV32IZDINXZHINX-NEXT: sw a1, 8(sp) ; RV32IZDINXZHINX-NEXT: sw a2, 12(sp) ; RV32IZDINXZHINX-NEXT: sw a3, 16(sp) -; RV32IZDINXZHINX-NEXT: sw a4, 20(sp) +; RV32IZDINXZHINX-NEXT: sw a0, 20(sp) +; RV32IZDINXZHINX-NEXT: addi a0, sp, 8 ; RV32IZDINXZHINX-NEXT: call __trunctfhf2 ; RV32IZDINXZHINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IZDINXZHINX-NEXT: addi sp, sp, 32 @@ -2627,12 +2627,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; CHECK32-IZFHMIN-NEXT: lw a1, 0(a0) ; CHECK32-IZFHMIN-NEXT: lw a2, 4(a0) ; CHECK32-IZFHMIN-NEXT: lw a3, 8(a0) -; CHECK32-IZFHMIN-NEXT: lw a4, 12(a0) -; CHECK32-IZFHMIN-NEXT: addi a0, sp, 8 +; CHECK32-IZFHMIN-NEXT: lw a0, 12(a0) ; CHECK32-IZFHMIN-NEXT: sw a1, 8(sp) ; CHECK32-IZFHMIN-NEXT: sw a2, 12(sp) ; CHECK32-IZFHMIN-NEXT: sw a3, 16(sp) -; CHECK32-IZFHMIN-NEXT: sw a4, 20(sp) +; CHECK32-IZFHMIN-NEXT: sw a0, 20(sp) +; CHECK32-IZFHMIN-NEXT: addi a0, sp, 8 ; CHECK32-IZFHMIN-NEXT: call __trunctfhf2 ; CHECK32-IZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; CHECK32-IZFHMIN-NEXT: addi sp, sp, 32 @@ -2654,12 +2654,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; CHECK32-IZHINXMIN-NEXT: lw a1, 0(a0) ; CHECK32-IZHINXMIN-NEXT: lw a2, 4(a0) ; CHECK32-IZHINXMIN-NEXT: lw a3, 8(a0) -; CHECK32-IZHINXMIN-NEXT: lw a4, 12(a0) -; CHECK32-IZHINXMIN-NEXT: addi a0, sp, 8 +; CHECK32-IZHINXMIN-NEXT: lw a0, 12(a0) ; CHECK32-IZHINXMIN-NEXT: sw a1, 8(sp) ; CHECK32-IZHINXMIN-NEXT: sw a2, 12(sp) ; CHECK32-IZHINXMIN-NEXT: sw a3, 16(sp) -; CHECK32-IZHINXMIN-NEXT: sw a4, 20(sp) +; CHECK32-IZHINXMIN-NEXT: sw a0, 20(sp) +; CHECK32-IZHINXMIN-NEXT: addi a0, sp, 8 ; CHECK32-IZHINXMIN-NEXT: call __trunctfhf2 ; CHECK32-IZHINXMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; CHECK32-IZHINXMIN-NEXT: addi sp, sp, 32 @@ -2681,12 +2681,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, 0(a0) ; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, 4(a0) ; CHECK32-IZDINXZHINXMIN-NEXT: lw a3, 8(a0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a4, 12(a0) -; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, sp, 8 +; CHECK32-IZDINXZHINXMIN-NEXT: lw a0, 12(a0) ; CHECK32-IZDINXZHINXMIN-NEXT: sw a1, 8(sp) ; CHECK32-IZDINXZHINXMIN-NEXT: sw a2, 12(sp) ; CHECK32-IZDINXZHINXMIN-NEXT: sw a3, 16(sp) -; CHECK32-IZDINXZHINXMIN-NEXT: sw a4, 20(sp) +; CHECK32-IZDINXZHINXMIN-NEXT: sw a0, 20(sp) +; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, sp, 8 ; CHECK32-IZDINXZHINXMIN-NEXT: call __trunctfhf2 ; CHECK32-IZDINXZHINXMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; CHECK32-IZDINXZHINXMIN-NEXT: addi sp, sp, 32 @@ -2708,12 +2708,12 @@ define half @fcvt_h_q(fp128 %a) nounwind strictfp { ; CHECK32-D-NEXT: lw a1, 0(a0) ; CHECK32-D-NEXT: lw a2, 4(a0) ; CHECK32-D-NEXT: lw a3, 8(a0) -; CHECK32-D-NEXT: lw a4, 12(a0) -; CHECK32-D-NEXT: addi a0, sp, 8 +; CHECK32-D-NEXT: lw a0, 12(a0) ; CHECK32-D-NEXT: sw a1, 8(sp) ; CHECK32-D-NEXT: sw a2, 12(sp) ; CHECK32-D-NEXT: sw a3, 16(sp) -; CHECK32-D-NEXT: sw a4, 20(sp) +; CHECK32-D-NEXT: sw a0, 20(sp) +; CHECK32-D-NEXT: addi a0, sp, 8 ; CHECK32-D-NEXT: call __trunctfhf2 ; CHECK32-D-NEXT: fmv.x.w a0, fa0 ; CHECK32-D-NEXT: lui a1, 1048560 diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index cf57ecd6cd1e4..7841f0209ce24 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -194,13 +194,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_si_h_sat: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, 
fa0 -; RV32IZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV32IZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV32IZFH-NEXT: lui a0, 815104 -; RV32IZFH-NEXT: fmv.w.x fa3, a0 -; RV32IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IZFH-NEXT: neg a0, a1 +; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_0) +; RV32IZFH-NEXT: fmv.w.x fa4, a0 +; RV32IZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IZFH-NEXT: neg a0, a0 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IZFH-NEXT: and a0, a0, a1 @@ -209,13 +209,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_si_h_sat: ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV64IZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV64IZFH-NEXT: lui a0, 815104 -; RV64IZFH-NEXT: fmv.w.x fa3, a0 -; RV64IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IZFH-NEXT: neg a0, a1 +; RV64IZFH-NEXT: lui a1, %hi(.LCPI1_0) +; RV64IZFH-NEXT: fmv.w.x fa4, a0 +; RV64IZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IZFH-NEXT: and a0, a0, a1 @@ -224,13 +224,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_si_h_sat: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV32IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV32IDZFH-NEXT: lui a0, 815104 -; RV32IDZFH-NEXT: fmv.w.x fa3, a0 -; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IDZFH-NEXT: neg a0, a1 +; RV32IDZFH-NEXT: lui a1, %hi(.LCPI1_0) +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 +; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IDZFH-NEXT: and a0, a0, a1 @@ -239,13 +239,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_si_h_sat: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IDZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV64IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV64IDZFH-NEXT: lui a0, 815104 -; RV64IDZFH-NEXT: fmv.w.x fa3, a0 -; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IDZFH-NEXT: neg a0, a1 +; RV64IDZFH-NEXT: lui a1, %hi(.LCPI1_0) +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 +; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IDZFH-NEXT: and a0, a0, a1 @@ -399,13 +399,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 -; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI1_0) -; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 -; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV32ID-ILP32-NEXT: lui a0, 815104 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, a0 -; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa3 -; RV32ID-ILP32-NEXT: neg a0, a1 +; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI1_0) +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 +; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 +; RV32ID-ILP32-NEXT: neg a0, a0 +; 
RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-ILP32-NEXT: and a0, a0, a1 @@ -419,13 +419,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 -; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI1_0) -; RV64ID-LP64-NEXT: feq.s a1, fa5, fa5 -; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV64ID-LP64-NEXT: lui a0, 815104 -; RV64ID-LP64-NEXT: fmv.w.x fa3, a0 -; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa3 -; RV64ID-LP64-NEXT: neg a0, a1 +; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI1_0) +; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 +; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 +; RV64ID-LP64-NEXT: neg a0, a0 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-LP64-NEXT: and a0, a0, a1 @@ -439,13 +439,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI1_0)(a1) ; RV32ID-NEXT: lui a1, 815104 -; RV32ID-NEXT: fmv.w.x fa4, a1 -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 +; RV32ID-NEXT: fmv.w.x fa5, a1 +; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) ; RV32ID-NEXT: neg a0, a0 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -458,13 +458,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI1_0)(a1) ; RV64ID-NEXT: lui a1, 815104 -; RV64ID-NEXT: fmv.w.x fa4, a1 -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 +; RV64ID-NEXT: fmv.w.x fa5, a1 +; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) ; RV64ID-NEXT: neg a0, a0 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -474,13 +474,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_si_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK32-IZFHMIN-NEXT: lui a0, 815104 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK32-IZFHMIN-NEXT: neg a0, a1 +; CHECK32-IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0) +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 @@ -489,13 +489,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_si_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; 
CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK64-IZFHMIN-NEXT: lui a0, 815104 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK64-IZFHMIN-NEXT: neg a0, a1 +; CHECK64-IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0) +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a1) ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 @@ -711,45 +711,45 @@ define i16 @fcvt_ui_h(half %a) nounwind { define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_ui_h_sat: ; RV32IZFH: # %bb.0: # %start +; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFH-NEXT: fmv.w.x fa4, zero ; RV32IZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV32IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IZFH-NEXT: fmv.w.x fa3, zero -; RV32IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IZFH-NEXT: ret ; ; RV64IZFH-LABEL: fcvt_ui_h_sat: ; RV64IZFH: # %bb.0: # %start +; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFH-NEXT: fmv.w.x fa4, zero ; RV64IZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV64IZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV64IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IZFH-NEXT: fmv.w.x fa3, zero -; RV64IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_ui_h_sat: ; RV32IDZFH: # %bb.0: # %start +; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IDZFH-NEXT: fmv.w.x fa4, zero ; RV32IDZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV32IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IDZFH-NEXT: fmv.w.x fa3, zero -; RV32IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_ui_h_sat: ; RV64IDZFH: # %bb.0: # %start +; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IDZFH-NEXT: fmv.w.x fa4, zero ; RV64IDZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV64IDZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV64IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IDZFH-NEXT: fmv.w.x fa3, zero -; RV64IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IDZFH-NEXT: ret ; @@ -874,12 +874,12 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: addi sp, sp, -16 ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI3_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI3_0)(a1) -; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero -; RV32ID-ILP32-NEXT: fmax.s fa4, fa4, fa3 -; RV32ID-ILP32-NEXT: fmin.s fa5, 
fa4, fa5 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero +; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI3_0) +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: addi sp, sp, 16 @@ -890,12 +890,12 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 -; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI3_0) -; RV64ID-LP64-NEXT: flw fa5, %lo(.LCPI3_0)(a1) -; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 -; RV64ID-LP64-NEXT: fmv.w.x fa3, zero -; RV64ID-LP64-NEXT: fmax.s fa4, fa4, fa3 -; RV64ID-LP64-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: fmv.w.x fa4, zero +; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI3_0) +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-LP64-NEXT: addi sp, sp, 16 @@ -906,11 +906,11 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32ID-NEXT: addi sp, sp, -16 ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 +; RV32ID-NEXT: fmv.w.x fa5, zero ; RV32ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: addi sp, sp, 16 @@ -921,11 +921,11 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV64ID-NEXT: addi sp, sp, -16 ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 +; RV64ID-NEXT: fmv.w.x fa5, zero ; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV64ID-NEXT: fmv.w.x fa4, zero -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-NEXT: addi sp, sp, 16 @@ -933,23 +933,23 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; ; CHECK32-IZFHMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start +; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero ; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK32-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start +; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, zero ; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK64-IZFHMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK64-IZFHMIN-NEXT: fcvt.s.h 
fa4, fa0 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK64-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI3_0)(a0) +; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz ; CHECK64-IZFHMIN-NEXT: ret ; @@ -2904,14 +2904,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IZFH-NEXT: fcvt.s.h fa0, fa0 ; RV32IZFH-NEXT: lui a0, %hi(.LCPI12_0) +; RV32IZFH-NEXT: fmv.w.x fa5, zero +; RV32IZFH-NEXT: fle.s a1, fa5, fa0 ; RV32IZFH-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32IZFH-NEXT: fcvt.s.h fa0, fa0 -; RV32IZFH-NEXT: fmv.w.x fa4, zero -; RV32IZFH-NEXT: fle.s a0, fa4, fa0 -; RV32IZFH-NEXT: flt.s a1, fa5, fa0 -; RV32IZFH-NEXT: neg s0, a1 -; RV32IZFH-NEXT: neg s1, a0 +; RV32IZFH-NEXT: flt.s a0, fa5, fa0 +; RV32IZFH-NEXT: neg s0, a0 +; RV32IZFH-NEXT: neg s1, a1 ; RV32IZFH-NEXT: call __fixunssfdi ; RV32IZFH-NEXT: and a0, s1, a0 ; RV32IZFH-NEXT: and a1, s1, a1 @@ -2938,14 +2938,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IDZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32IDZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32IDZFH-NEXT: fcvt.s.h fa0, fa0 ; RV32IDZFH-NEXT: lui a0, %hi(.LCPI12_0) +; RV32IDZFH-NEXT: fmv.w.x fa5, zero +; RV32IDZFH-NEXT: fle.s a1, fa5, fa0 ; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32IDZFH-NEXT: fcvt.s.h fa0, fa0 -; RV32IDZFH-NEXT: fmv.w.x fa4, zero -; RV32IDZFH-NEXT: fle.s a0, fa4, fa0 -; RV32IDZFH-NEXT: flt.s a1, fa5, fa0 -; RV32IDZFH-NEXT: neg s0, a1 -; RV32IDZFH-NEXT: neg s1, a0 +; RV32IDZFH-NEXT: flt.s a0, fa5, fa0 +; RV32IDZFH-NEXT: neg s0, a0 +; RV32IDZFH-NEXT: neg s1, a1 ; RV32IDZFH-NEXT: call __fixunssfdi ; RV32IDZFH-NEXT: and a0, s1, a0 ; RV32IDZFH-NEXT: and a1, s1, a1 @@ -3103,14 +3103,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 ; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI12_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI12_0)(a1) -; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero -; RV32ID-ILP32-NEXT: fle.s a1, fa3, fa4 -; RV32ID-ILP32-NEXT: flt.s a2, fa5, fa4 -; RV32ID-ILP32-NEXT: neg s0, a2 -; RV32ID-ILP32-NEXT: neg s1, a1 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero +; RV32ID-ILP32-NEXT: fle.s a2, fa4, fa5 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI12_0)(a1) +; RV32ID-ILP32-NEXT: flt.s a1, fa4, fa5 +; RV32ID-ILP32-NEXT: neg s0, a1 +; RV32ID-ILP32-NEXT: neg s1, a2 ; RV32ID-ILP32-NEXT: call __fixunssfdi ; RV32ID-ILP32-NEXT: and a0, s1, a0 ; RV32ID-ILP32-NEXT: and a1, s1, a1 @@ -3145,12 +3145,12 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32ID-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: lui a0, %hi(.LCPI12_0) +; RV32ID-NEXT: fmv.w.x fa5, zero +; RV32ID-NEXT: fle.s a1, fa5, fa0 ; RV32ID-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fle.s a0, fa4, fa0 -; RV32ID-NEXT: flt.s a1, fa5, fa0 -; RV32ID-NEXT: neg s0, a1 -; RV32ID-NEXT: neg s1, a0 +; RV32ID-NEXT: flt.s a0, fa5, fa0 +; RV32ID-NEXT: neg s0, a0 +; RV32ID-NEXT: neg s1, a1 ; RV32ID-NEXT: call __fixunssfdi ; RV32ID-NEXT: and 
a0, s1, a0 ; RV32ID-NEXT: and a1, s1, a1 @@ -3182,14 +3182,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK32-IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; CHECK32-IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; CHECK32-IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa0, fa0 ; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI12_0) +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa5, zero +; CHECK32-IZFHMIN-NEXT: fle.s a1, fa5, fa0 ; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa0, fa0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK32-IZFHMIN-NEXT: fle.s a0, fa4, fa0 -; CHECK32-IZFHMIN-NEXT: flt.s a1, fa5, fa0 -; CHECK32-IZFHMIN-NEXT: neg s0, a1 -; CHECK32-IZFHMIN-NEXT: neg s1, a0 +; CHECK32-IZFHMIN-NEXT: flt.s a0, fa5, fa0 +; CHECK32-IZFHMIN-NEXT: neg s0, a0 +; CHECK32-IZFHMIN-NEXT: neg s1, a1 ; CHECK32-IZFHMIN-NEXT: call __fixunssfdi ; CHECK32-IZFHMIN-NEXT: and a0, s1, a0 ; CHECK32-IZFHMIN-NEXT: and a1, s1, a1 @@ -6296,13 +6296,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_w_s_sat_i16: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV32IZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV32IZFH-NEXT: lui a0, 815104 -; RV32IZFH-NEXT: fmv.w.x fa3, a0 -; RV32IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IZFH-NEXT: neg a0, a1 +; RV32IZFH-NEXT: lui a1, %hi(.LCPI32_0) +; RV32IZFH-NEXT: fmv.w.x fa4, a0 +; RV32IZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IZFH-NEXT: neg a0, a0 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IZFH-NEXT: and a0, a0, a1 @@ -6311,13 +6311,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_w_s_sat_i16: ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV64IZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV64IZFH-NEXT: lui a0, 815104 -; RV64IZFH-NEXT: fmv.w.x fa3, a0 -; RV64IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IZFH-NEXT: neg a0, a1 +; RV64IZFH-NEXT: lui a1, %hi(.LCPI32_0) +; RV64IZFH-NEXT: fmv.w.x fa4, a0 +; RV64IZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IZFH-NEXT: and a0, a0, a1 @@ -6326,13 +6326,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_w_s_sat_i16: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV32IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV32IDZFH-NEXT: lui a0, 815104 -; RV32IDZFH-NEXT: fmv.w.x fa3, a0 -; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IDZFH-NEXT: neg a0, a1 +; RV32IDZFH-NEXT: lui a1, %hi(.LCPI32_0) +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 +; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IDZFH-NEXT: and a0, a0, a1 @@ -6341,13 +6341,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_w_s_sat_i16: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IDZFH-NEXT: lui 
a0, %hi(.LCPI32_0) -; RV64IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV64IDZFH-NEXT: lui a0, 815104 -; RV64IDZFH-NEXT: fmv.w.x fa3, a0 -; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IDZFH-NEXT: neg a0, a1 +; RV64IDZFH-NEXT: lui a1, %hi(.LCPI32_0) +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 +; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IDZFH-NEXT: and a0, a0, a1 @@ -6505,13 +6505,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 -; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI32_0) -; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 -; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV32ID-ILP32-NEXT: lui a0, 815104 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, a0 -; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa3 -; RV32ID-ILP32-NEXT: neg a0, a1 +; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI32_0) +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 +; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 +; RV32ID-ILP32-NEXT: neg a0, a0 +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-ILP32-NEXT: and a0, a0, a1 @@ -6525,13 +6525,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 -; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI32_0) -; RV64ID-LP64-NEXT: feq.s a1, fa5, fa5 -; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV64ID-LP64-NEXT: lui a0, 815104 -; RV64ID-LP64-NEXT: fmv.w.x fa3, a0 -; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa3 -; RV64ID-LP64-NEXT: neg a0, a1 +; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI32_0) +; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 +; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 +; RV64ID-LP64-NEXT: neg a0, a0 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-LP64-NEXT: and a0, a0, a1 @@ -6545,13 +6545,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: lui a1, %hi(.LCPI32_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI32_0)(a1) ; RV32ID-NEXT: lui a1, 815104 -; RV32ID-NEXT: fmv.w.x fa4, a1 -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 +; RV32ID-NEXT: fmv.w.x fa5, a1 +; RV32ID-NEXT: lui a1, %hi(.LCPI32_0) ; RV32ID-NEXT: neg a0, a0 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: flw fa4, %lo(.LCPI32_0)(a1) +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -6564,13 +6564,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: lui a1, %hi(.LCPI32_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI32_0)(a1) ; RV64ID-NEXT: lui a1, 815104 -; RV64ID-NEXT: fmv.w.x fa4, a1 -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 +; RV64ID-NEXT: fmv.w.x fa5, a1 +; RV64ID-NEXT: lui a1, %hi(.LCPI32_0) ; RV64ID-NEXT: neg a0, a0 -; 
RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: flw fa4, %lo(.LCPI32_0)(a1) +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -6580,13 +6580,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI32_0) -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; CHECK32-IZFHMIN-NEXT: lui a0, 815104 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK32-IZFHMIN-NEXT: neg a0, a1 +; CHECK32-IZFHMIN-NEXT: lui a1, %hi(.LCPI32_0) +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 @@ -6595,13 +6595,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI32_0) -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; CHECK64-IZFHMIN-NEXT: lui a0, 815104 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK64-IZFHMIN-NEXT: neg a0, a1 +; CHECK64-IZFHMIN-NEXT: lui a1, %hi(.LCPI32_0) +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a1) ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 @@ -6816,45 +6816,45 @@ define zeroext i16 @fcvt_wu_s_i16(half %a) nounwind { define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_wu_s_sat_i16: ; RV32IZFH: # %bb.0: # %start +; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFH-NEXT: fmv.w.x fa4, zero ; RV32IZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV32IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IZFH-NEXT: fmv.w.x fa3, zero -; RV32IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IZFH-NEXT: ret ; ; RV64IZFH-LABEL: fcvt_wu_s_sat_i16: ; RV64IZFH: # %bb.0: # %start +; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFH-NEXT: fmv.w.x fa4, zero ; RV64IZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV64IZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV64IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IZFH-NEXT: fmv.w.x fa3, zero -; RV64IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_wu_s_sat_i16: ; RV32IDZFH: # %bb.0: # %start +; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IDZFH-NEXT: fmv.w.x fa4, zero ; RV32IDZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV32IDZFH-NEXT: flw 
fa5, %lo(.LCPI34_0)(a0) -; RV32IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IDZFH-NEXT: fmv.w.x fa3, zero -; RV32IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_wu_s_sat_i16: ; RV64IDZFH: # %bb.0: # %start +; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IDZFH-NEXT: fmv.w.x fa4, zero ; RV64IDZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV64IDZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV64IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IDZFH-NEXT: fmv.w.x fa3, zero -; RV64IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IDZFH-NEXT: ret ; @@ -6985,12 +6985,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32ID-ILP32-NEXT: addi sp, sp, -16 ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI34_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI34_0)(a1) -; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero -; RV32ID-ILP32-NEXT: fmax.s fa4, fa4, fa3 -; RV32ID-ILP32-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero +; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI34_0) +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: addi sp, sp, 16 @@ -7001,12 +7001,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 -; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI34_0) -; RV64ID-LP64-NEXT: flw fa5, %lo(.LCPI34_0)(a1) -; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 -; RV64ID-LP64-NEXT: fmv.w.x fa3, zero -; RV64ID-LP64-NEXT: fmax.s fa4, fa4, fa3 -; RV64ID-LP64-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: fmv.w.x fa4, zero +; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI34_0) +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-LP64-NEXT: addi sp, sp, 16 @@ -7017,11 +7017,11 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32ID-NEXT: addi sp, sp, -16 ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 +; RV32ID-NEXT: fmv.w.x fa5, zero ; RV32ID-NEXT: lui a0, %hi(.LCPI34_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: addi sp, sp, 16 @@ -7032,11 +7032,11 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV64ID-NEXT: addi sp, sp, -16 ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 +; RV64ID-NEXT: 
fmv.w.x fa5, zero ; RV64ID-NEXT: lui a0, %hi(.LCPI34_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV64ID-NEXT: fmv.w.x fa4, zero -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-NEXT: addi sp, sp, 16 @@ -7044,23 +7044,23 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; ; CHECK32-IZFHMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK32-IZFHMIN: # %bb.0: # %start +; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero ; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK32-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZFHMIN: # %bb.0: # %start +; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, zero ; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK64-IZFHMIN-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK64-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI34_0)(a0) +; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz ; CHECK64-IZFHMIN-NEXT: ret ; @@ -8595,16 +8595,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32IZFH-NEXT: addi sp, sp, -32 ; RV32IZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZFH-NEXT: lw a2, 0(a0) -; RV32IZFH-NEXT: lw a3, 4(a0) -; RV32IZFH-NEXT: lw a4, 8(a0) -; RV32IZFH-NEXT: lw a5, 12(a0) ; RV32IZFH-NEXT: mv s0, a1 +; RV32IZFH-NEXT: lw a1, 0(a0) +; RV32IZFH-NEXT: lw a2, 4(a0) +; RV32IZFH-NEXT: lw a3, 8(a0) +; RV32IZFH-NEXT: lw a0, 12(a0) +; RV32IZFH-NEXT: sw a1, 8(sp) +; RV32IZFH-NEXT: sw a2, 12(sp) +; RV32IZFH-NEXT: sw a3, 16(sp) +; RV32IZFH-NEXT: sw a0, 20(sp) ; RV32IZFH-NEXT: addi a0, sp, 8 -; RV32IZFH-NEXT: sw a2, 8(sp) -; RV32IZFH-NEXT: sw a3, 12(sp) -; RV32IZFH-NEXT: sw a4, 16(sp) -; RV32IZFH-NEXT: sw a5, 20(sp) ; RV32IZFH-NEXT: call __trunctfhf2 ; RV32IZFH-NEXT: fsh fa0, 0(s0) ; RV32IZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8630,16 +8630,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32IDZFH-NEXT: addi sp, sp, -32 ; RV32IDZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IDZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IDZFH-NEXT: lw a2, 0(a0) -; RV32IDZFH-NEXT: lw a3, 4(a0) -; RV32IDZFH-NEXT: lw a4, 8(a0) -; RV32IDZFH-NEXT: lw a5, 12(a0) ; RV32IDZFH-NEXT: mv s0, a1 +; RV32IDZFH-NEXT: lw a1, 0(a0) +; RV32IDZFH-NEXT: lw a2, 4(a0) +; RV32IDZFH-NEXT: lw a3, 8(a0) +; RV32IDZFH-NEXT: lw a0, 12(a0) +; RV32IDZFH-NEXT: sw a1, 8(sp) +; RV32IDZFH-NEXT: sw a2, 12(sp) +; RV32IDZFH-NEXT: sw a3, 16(sp) +; RV32IDZFH-NEXT: sw a0, 20(sp) ; RV32IDZFH-NEXT: addi a0, sp, 8 -; RV32IDZFH-NEXT: sw a2, 8(sp) -; RV32IDZFH-NEXT: sw a3, 12(sp) -; RV32IDZFH-NEXT: sw a4, 16(sp) -; RV32IDZFH-NEXT: sw a5, 
20(sp) ; RV32IDZFH-NEXT: call __trunctfhf2 ; RV32IDZFH-NEXT: fsh fa0, 0(s0) ; RV32IDZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8665,16 +8665,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32IZHINX-NEXT: addi sp, sp, -32 ; RV32IZHINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZHINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZHINX-NEXT: lw a2, 0(a0) -; RV32IZHINX-NEXT: lw a3, 4(a0) -; RV32IZHINX-NEXT: lw a4, 8(a0) -; RV32IZHINX-NEXT: lw a5, 12(a0) ; RV32IZHINX-NEXT: mv s0, a1 +; RV32IZHINX-NEXT: lw a1, 0(a0) +; RV32IZHINX-NEXT: lw a2, 4(a0) +; RV32IZHINX-NEXT: lw a3, 8(a0) +; RV32IZHINX-NEXT: lw a0, 12(a0) +; RV32IZHINX-NEXT: sw a1, 8(sp) +; RV32IZHINX-NEXT: sw a2, 12(sp) +; RV32IZHINX-NEXT: sw a3, 16(sp) +; RV32IZHINX-NEXT: sw a0, 20(sp) ; RV32IZHINX-NEXT: addi a0, sp, 8 -; RV32IZHINX-NEXT: sw a2, 8(sp) -; RV32IZHINX-NEXT: sw a3, 12(sp) -; RV32IZHINX-NEXT: sw a4, 16(sp) -; RV32IZHINX-NEXT: sw a5, 20(sp) ; RV32IZHINX-NEXT: call __trunctfhf2 ; RV32IZHINX-NEXT: sh a0, 0(s0) ; RV32IZHINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8700,16 +8700,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32IZDINXZHINX-NEXT: addi sp, sp, -32 ; RV32IZDINXZHINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32IZDINXZHINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32IZDINXZHINX-NEXT: lw a2, 0(a0) -; RV32IZDINXZHINX-NEXT: lw a3, 4(a0) -; RV32IZDINXZHINX-NEXT: lw a4, 8(a0) -; RV32IZDINXZHINX-NEXT: lw a5, 12(a0) ; RV32IZDINXZHINX-NEXT: mv s0, a1 +; RV32IZDINXZHINX-NEXT: lw a1, 0(a0) +; RV32IZDINXZHINX-NEXT: lw a2, 4(a0) +; RV32IZDINXZHINX-NEXT: lw a3, 8(a0) +; RV32IZDINXZHINX-NEXT: lw a0, 12(a0) +; RV32IZDINXZHINX-NEXT: sw a1, 8(sp) +; RV32IZDINXZHINX-NEXT: sw a2, 12(sp) +; RV32IZDINXZHINX-NEXT: sw a3, 16(sp) +; RV32IZDINXZHINX-NEXT: sw a0, 20(sp) ; RV32IZDINXZHINX-NEXT: addi a0, sp, 8 -; RV32IZDINXZHINX-NEXT: sw a2, 8(sp) -; RV32IZDINXZHINX-NEXT: sw a3, 12(sp) -; RV32IZDINXZHINX-NEXT: sw a4, 16(sp) -; RV32IZDINXZHINX-NEXT: sw a5, 20(sp) ; RV32IZDINXZHINX-NEXT: call __trunctfhf2 ; RV32IZDINXZHINX-NEXT: sh a0, 0(s0) ; RV32IZDINXZHINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8735,16 +8735,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32I-NEXT: addi sp, sp, -32 ; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a2, 0(a0) -; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a4, 8(a0) -; RV32I-NEXT: lw a5, 12(a0) ; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 4(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: sw a1, 8(sp) +; RV32I-NEXT: sw a2, 12(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: addi a0, sp, 8 -; RV32I-NEXT: sw a2, 8(sp) -; RV32I-NEXT: sw a3, 12(sp) -; RV32I-NEXT: sw a4, 16(sp) -; RV32I-NEXT: sw a5, 20(sp) ; RV32I-NEXT: call __trunctfhf2 ; RV32I-NEXT: sh a0, 0(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8770,16 +8770,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32ID-ILP32-NEXT: addi sp, sp, -32 ; RV32ID-ILP32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32ID-ILP32-NEXT: lw a2, 0(a0) -; RV32ID-ILP32-NEXT: lw a3, 4(a0) -; RV32ID-ILP32-NEXT: lw a4, 8(a0) -; RV32ID-ILP32-NEXT: lw a5, 12(a0) ; RV32ID-ILP32-NEXT: mv s0, a1 +; RV32ID-ILP32-NEXT: lw a1, 0(a0) +; RV32ID-ILP32-NEXT: lw a2, 4(a0) +; RV32ID-ILP32-NEXT: lw a3, 8(a0) +; RV32ID-ILP32-NEXT: lw a0, 12(a0) +; RV32ID-ILP32-NEXT: sw a1, 8(sp) +; 
RV32ID-ILP32-NEXT: sw a2, 12(sp) +; RV32ID-ILP32-NEXT: sw a3, 16(sp) +; RV32ID-ILP32-NEXT: sw a0, 20(sp) ; RV32ID-ILP32-NEXT: addi a0, sp, 8 -; RV32ID-ILP32-NEXT: sw a2, 8(sp) -; RV32ID-ILP32-NEXT: sw a3, 12(sp) -; RV32ID-ILP32-NEXT: sw a4, 16(sp) -; RV32ID-ILP32-NEXT: sw a5, 20(sp) ; RV32ID-ILP32-NEXT: call __trunctfhf2 ; RV32ID-ILP32-NEXT: sh a0, 0(s0) ; RV32ID-ILP32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8805,16 +8805,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; RV32ID-NEXT: addi sp, sp, -32 ; RV32ID-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; RV32ID-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32ID-NEXT: lw a2, 0(a0) -; RV32ID-NEXT: lw a3, 4(a0) -; RV32ID-NEXT: lw a4, 8(a0) -; RV32ID-NEXT: lw a5, 12(a0) ; RV32ID-NEXT: mv s0, a1 +; RV32ID-NEXT: lw a1, 0(a0) +; RV32ID-NEXT: lw a2, 4(a0) +; RV32ID-NEXT: lw a3, 8(a0) +; RV32ID-NEXT: lw a0, 12(a0) +; RV32ID-NEXT: sw a1, 8(sp) +; RV32ID-NEXT: sw a2, 12(sp) +; RV32ID-NEXT: sw a3, 16(sp) +; RV32ID-NEXT: sw a0, 20(sp) ; RV32ID-NEXT: addi a0, sp, 8 -; RV32ID-NEXT: sw a2, 8(sp) -; RV32ID-NEXT: sw a3, 12(sp) -; RV32ID-NEXT: sw a4, 16(sp) -; RV32ID-NEXT: sw a5, 20(sp) ; RV32ID-NEXT: call __trunctfhf2 ; RV32ID-NEXT: fmv.x.w a0, fa0 ; RV32ID-NEXT: sh a0, 0(s0) @@ -8842,16 +8842,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; CHECK32-IZFHMIN-NEXT: addi sp, sp, -32 ; CHECK32-IZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; CHECK32-IZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; CHECK32-IZFHMIN-NEXT: lw a2, 0(a0) -; CHECK32-IZFHMIN-NEXT: lw a3, 4(a0) -; CHECK32-IZFHMIN-NEXT: lw a4, 8(a0) -; CHECK32-IZFHMIN-NEXT: lw a5, 12(a0) ; CHECK32-IZFHMIN-NEXT: mv s0, a1 +; CHECK32-IZFHMIN-NEXT: lw a1, 0(a0) +; CHECK32-IZFHMIN-NEXT: lw a2, 4(a0) +; CHECK32-IZFHMIN-NEXT: lw a3, 8(a0) +; CHECK32-IZFHMIN-NEXT: lw a0, 12(a0) +; CHECK32-IZFHMIN-NEXT: sw a1, 8(sp) +; CHECK32-IZFHMIN-NEXT: sw a2, 12(sp) +; CHECK32-IZFHMIN-NEXT: sw a3, 16(sp) +; CHECK32-IZFHMIN-NEXT: sw a0, 20(sp) ; CHECK32-IZFHMIN-NEXT: addi a0, sp, 8 -; CHECK32-IZFHMIN-NEXT: sw a2, 8(sp) -; CHECK32-IZFHMIN-NEXT: sw a3, 12(sp) -; CHECK32-IZFHMIN-NEXT: sw a4, 16(sp) -; CHECK32-IZFHMIN-NEXT: sw a5, 20(sp) ; CHECK32-IZFHMIN-NEXT: call __trunctfhf2 ; CHECK32-IZFHMIN-NEXT: fsh fa0, 0(s0) ; CHECK32-IZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8877,16 +8877,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; CHECK32-IZHINXMIN-NEXT: addi sp, sp, -32 ; CHECK32-IZHINXMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; CHECK32-IZHINXMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; CHECK32-IZHINXMIN-NEXT: lw a2, 0(a0) -; CHECK32-IZHINXMIN-NEXT: lw a3, 4(a0) -; CHECK32-IZHINXMIN-NEXT: lw a4, 8(a0) -; CHECK32-IZHINXMIN-NEXT: lw a5, 12(a0) ; CHECK32-IZHINXMIN-NEXT: mv s0, a1 +; CHECK32-IZHINXMIN-NEXT: lw a1, 0(a0) +; CHECK32-IZHINXMIN-NEXT: lw a2, 4(a0) +; CHECK32-IZHINXMIN-NEXT: lw a3, 8(a0) +; CHECK32-IZHINXMIN-NEXT: lw a0, 12(a0) +; CHECK32-IZHINXMIN-NEXT: sw a1, 8(sp) +; CHECK32-IZHINXMIN-NEXT: sw a2, 12(sp) +; CHECK32-IZHINXMIN-NEXT: sw a3, 16(sp) +; CHECK32-IZHINXMIN-NEXT: sw a0, 20(sp) ; CHECK32-IZHINXMIN-NEXT: addi a0, sp, 8 -; CHECK32-IZHINXMIN-NEXT: sw a2, 8(sp) -; CHECK32-IZHINXMIN-NEXT: sw a3, 12(sp) -; CHECK32-IZHINXMIN-NEXT: sw a4, 16(sp) -; CHECK32-IZHINXMIN-NEXT: sw a5, 20(sp) ; CHECK32-IZHINXMIN-NEXT: call __trunctfhf2 ; CHECK32-IZHINXMIN-NEXT: sh a0, 0(s0) ; CHECK32-IZHINXMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -8912,16 +8912,16 @@ define void @fcvt_h_q(fp128 %x, ptr %y) nounwind { ; CHECK32-IZDINXZHINXMIN-NEXT: addi sp, sp, -32 ; 
CHECK32-IZDINXZHINXMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; CHECK32-IZDINXZHINXMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, 0(a0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a3, 4(a0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a4, 8(a0) -; CHECK32-IZDINXZHINXMIN-NEXT: lw a5, 12(a0) ; CHECK32-IZDINXZHINXMIN-NEXT: mv s0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: lw a1, 0(a0) +; CHECK32-IZDINXZHINXMIN-NEXT: lw a2, 4(a0) +; CHECK32-IZDINXZHINXMIN-NEXT: lw a3, 8(a0) +; CHECK32-IZDINXZHINXMIN-NEXT: lw a0, 12(a0) +; CHECK32-IZDINXZHINXMIN-NEXT: sw a1, 8(sp) +; CHECK32-IZDINXZHINXMIN-NEXT: sw a2, 12(sp) +; CHECK32-IZDINXZHINXMIN-NEXT: sw a3, 16(sp) +; CHECK32-IZDINXZHINXMIN-NEXT: sw a0, 20(sp) ; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, sp, 8 -; CHECK32-IZDINXZHINXMIN-NEXT: sw a2, 8(sp) -; CHECK32-IZDINXZHINXMIN-NEXT: sw a3, 12(sp) -; CHECK32-IZDINXZHINXMIN-NEXT: sw a4, 16(sp) -; CHECK32-IZDINXZHINXMIN-NEXT: sw a5, 20(sp) ; CHECK32-IZDINXZHINXMIN-NEXT: call __trunctfhf2 ; CHECK32-IZDINXZHINXMIN-NEXT: sh a0, 0(s0) ; CHECK32-IZDINXZHINXMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll index 12cf088e3205f..7754f5b8f9f3a 100644 --- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll @@ -222,8 +222,8 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: flt.h a2, fa1, fa0 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: or a0, a2, a1 ; CHECK-NEXT: feq.h zero, fa1, fa0 +; CHECK-NEXT: or a0, a2, a1 ; CHECK-NEXT: ret ; ; CHECKIZHINX-LABEL: fcmp_one: @@ -235,9 +235,8 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: frflags a2 ; CHECKIZHINX-NEXT: flt.h a4, a1, a0 ; CHECKIZHINX-NEXT: fsflags a2 -; CHECKIZHINX-NEXT: or a2, a4, a3 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: or a0, a4, a3 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_one: @@ -249,23 +248,23 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp { ; CHECKIZFHMIN-NEXT: frflags a0 ; CHECKIZFHMIN-NEXT: flt.h a2, fa1, fa0 ; CHECKIZFHMIN-NEXT: fsflags a0 -; CHECKIZFHMIN-NEXT: or a0, a2, a1 ; CHECKIZFHMIN-NEXT: feq.h zero, fa1, fa0 +; CHECKIZFHMIN-NEXT: or a0, a2, a1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fcmp_one: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a0 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: flt.s a3, a2, a1 -; CHECKIZHINXMIN-NEXT: fsflags a0 -; CHECKIZHINXMIN-NEXT: feq.s zero, a2, a1 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: flt.s a4, a1, a2 -; CHECKIZHINXMIN-NEXT: fsflags a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: flt.s a3, a0, a1 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a0, a1 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: flt.s a4, a1, a0 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a0 ; CHECKIZHINXMIN-NEXT: or a0, a4, a3 -; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a2 ; CHECKIZHINXMIN-NEXT: ret %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"one", metadata !"fpexcept.strict") strictfp %2 = zext i1 %1 to i32 @@ -319,9 +318,9 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: flt.h a2, fa1, fa0 ; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: feq.h 
zero, fa1, fa0 ; CHECK-NEXT: or a1, a2, a1 ; CHECK-NEXT: xori a0, a1, 1 -; CHECK-NEXT: feq.h zero, fa1, fa0 ; CHECK-NEXT: ret ; ; CHECKIZHINX-LABEL: fcmp_ueq: @@ -333,10 +332,9 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: frflags a2 ; CHECKIZHINX-NEXT: flt.h a4, a1, a0 ; CHECKIZHINX-NEXT: fsflags a2 -; CHECKIZHINX-NEXT: or a3, a4, a3 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: or a3, a4, a3 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ueq: @@ -348,25 +346,25 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp { ; CHECKIZFHMIN-NEXT: frflags a0 ; CHECKIZFHMIN-NEXT: flt.h a2, fa1, fa0 ; CHECKIZFHMIN-NEXT: fsflags a0 +; CHECKIZFHMIN-NEXT: feq.h zero, fa1, fa0 ; CHECKIZFHMIN-NEXT: or a1, a2, a1 ; CHECKIZFHMIN-NEXT: xori a0, a1, 1 -; CHECKIZFHMIN-NEXT: feq.h zero, fa1, fa0 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fcmp_ueq: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a0 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: flt.s a3, a2, a1 -; CHECKIZHINXMIN-NEXT: fsflags a0 -; CHECKIZHINXMIN-NEXT: feq.s zero, a2, a1 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: flt.s a4, a1, a2 -; CHECKIZHINXMIN-NEXT: fsflags a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: flt.s a3, a0, a1 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a0, a1 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: flt.s a4, a1, a0 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a0 ; CHECKIZHINXMIN-NEXT: or a3, a4, a3 ; CHECKIZHINXMIN-NEXT: xori a0, a3, 1 -; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a2 ; CHECKIZHINXMIN-NEXT: ret %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ueq", metadata !"fpexcept.strict") strictfp %2 = zext i1 %1 to i32 @@ -379,8 +377,8 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: fle.h a1, fa0, fa1 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: feq.h zero, fa0, fa1 +; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: ret ; ; CHECKIZHINX-LABEL: fcmp_ugt: @@ -388,9 +386,8 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: frflags a2 ; CHECKIZHINX-NEXT: fle.h a3, a0, a1 ; CHECKIZHINX-NEXT: fsflags a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a0, a1 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ugt: @@ -398,19 +395,19 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp { ; CHECKIZFHMIN-NEXT: frflags a0 ; CHECKIZFHMIN-NEXT: fle.h a1, fa0, fa1 ; CHECKIZFHMIN-NEXT: fsflags a0 -; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: feq.h zero, fa0, fa1 +; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fcmp_ugt: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a0 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: fle.s a3, a2, a1 -; CHECKIZHINXMIN-NEXT: fsflags a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: fle.s a3, a0, a1 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a0, a1 ; CHECKIZHINXMIN-NEXT: xori a0, a3, 1 -; CHECKIZHINXMIN-NEXT: feq.s 
zero, a2, a1 ; CHECKIZHINXMIN-NEXT: ret %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ugt", metadata !"fpexcept.strict") strictfp %2 = zext i1 %1 to i32 @@ -423,8 +420,8 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: flt.h a1, fa0, fa1 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: feq.h zero, fa0, fa1 +; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: ret ; ; CHECKIZHINX-LABEL: fcmp_uge: @@ -432,9 +429,8 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: frflags a2 ; CHECKIZHINX-NEXT: flt.h a3, a0, a1 ; CHECKIZHINX-NEXT: fsflags a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a0, a1 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_uge: @@ -442,19 +438,19 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp { ; CHECKIZFHMIN-NEXT: frflags a0 ; CHECKIZFHMIN-NEXT: flt.h a1, fa0, fa1 ; CHECKIZFHMIN-NEXT: fsflags a0 -; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: feq.h zero, fa0, fa1 +; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fcmp_uge: ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a0 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: flt.s a3, a2, a1 -; CHECKIZHINXMIN-NEXT: fsflags a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: flt.s a3, a0, a1 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a0, a1 ; CHECKIZHINXMIN-NEXT: xori a0, a3, 1 -; CHECKIZHINXMIN-NEXT: feq.s zero, a2, a1 ; CHECKIZHINXMIN-NEXT: ret %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"uge", metadata !"fpexcept.strict") strictfp %2 = zext i1 %1 to i32 @@ -467,8 +463,8 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: fle.h a1, fa1, fa0 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: feq.h zero, fa1, fa0 +; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: ret ; ; CHECKIZHINX-LABEL: fcmp_ult: @@ -476,9 +472,8 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: frflags a2 ; CHECKIZHINX-NEXT: fle.h a3, a1, a0 ; CHECKIZHINX-NEXT: fsflags a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ult: @@ -486,19 +481,19 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp { ; CHECKIZFHMIN-NEXT: frflags a0 ; CHECKIZFHMIN-NEXT: fle.h a1, fa1, fa0 ; CHECKIZFHMIN-NEXT: fsflags a0 -; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: feq.h zero, fa1, fa0 +; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fcmp_ult: ; CHECKIZHINXMIN: # %bb.0: -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: fle.s a3, a1, a2 -; CHECKIZHINXMIN-NEXT: fsflags a0 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: fle.s a3, a1, a0 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a0 ; CHECKIZHINXMIN-NEXT: xori a0, a3, 1 -; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a2 ; CHECKIZHINXMIN-NEXT: ret %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata 
!"ult", metadata !"fpexcept.strict") strictfp %2 = zext i1 %1 to i32 @@ -511,8 +506,8 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: flt.h a1, fa1, fa0 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: feq.h zero, fa1, fa0 +; CHECK-NEXT: xori a0, a1, 1 ; CHECK-NEXT: ret ; ; CHECKIZHINX-LABEL: fcmp_ule: @@ -520,9 +515,8 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: frflags a2 ; CHECKIZHINX-NEXT: flt.h a3, a1, a0 ; CHECKIZHINX-NEXT: fsflags a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ule: @@ -530,19 +524,19 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp { ; CHECKIZFHMIN-NEXT: frflags a0 ; CHECKIZFHMIN-NEXT: flt.h a1, fa1, fa0 ; CHECKIZFHMIN-NEXT: fsflags a0 -; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: feq.h zero, fa1, fa0 +; CHECKIZFHMIN-NEXT: xori a0, a1, 1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fcmp_ule: ; CHECKIZHINXMIN: # %bb.0: -; CHECKIZHINXMIN-NEXT: fcvt.s.h a2, a0 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 -; CHECKIZHINXMIN-NEXT: frflags a0 -; CHECKIZHINXMIN-NEXT: flt.s a3, a1, a2 -; CHECKIZHINXMIN-NEXT: fsflags a0 +; CHECKIZHINXMIN-NEXT: frflags a2 +; CHECKIZHINXMIN-NEXT: flt.s a3, a1, a0 +; CHECKIZHINXMIN-NEXT: fsflags a2 +; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a0 ; CHECKIZHINXMIN-NEXT: xori a0, a3, 1 -; CHECKIZHINXMIN-NEXT: feq.s zero, a1, a2 ; CHECKIZHINXMIN-NEXT: ret %1 = call i1 @llvm.experimental.constrained.fcmp.f16(half %a, half %b, metadata !"ule", metadata !"fpexcept.strict") strictfp %2 = zext i1 %1 to i32 diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index 7fcad77c7c17b..5d5f58278235c 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -3439,8 +3439,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV32IZFH-NEXT: fcvt.s.h fa0, fa0 ; RV32IZFH-NEXT: addi a0, sp, 8 ; RV32IZFH-NEXT: call frexpf -; RV32IZFH-NEXT: lw a0, 8(sp) ; RV32IZFH-NEXT: fcvt.h.s fa0, fa0 +; RV32IZFH-NEXT: lw a0, 8(sp) ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: addi sp, sp, 16 ; RV32IZFH-NEXT: ret @@ -3452,8 +3452,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV64IZFH-NEXT: fcvt.s.h fa0, fa0 ; RV64IZFH-NEXT: mv a0, sp ; RV64IZFH-NEXT: call frexpf -; RV64IZFH-NEXT: ld a0, 0(sp) ; RV64IZFH-NEXT: fcvt.h.s fa0, fa0 +; RV64IZFH-NEXT: ld a0, 0(sp) ; RV64IZFH-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IZFH-NEXT: addi sp, sp, 16 ; RV64IZFH-NEXT: ret @@ -3465,8 +3465,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV32IZHINX-NEXT: fcvt.s.h a0, a0 ; RV32IZHINX-NEXT: addi a1, sp, 8 ; RV32IZHINX-NEXT: call frexpf -; RV32IZHINX-NEXT: lw a1, 8(sp) ; RV32IZHINX-NEXT: fcvt.h.s a0, a0 +; RV32IZHINX-NEXT: lw a1, 8(sp) ; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINX-NEXT: addi sp, sp, 16 ; RV32IZHINX-NEXT: ret @@ -3478,8 +3478,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV64IZHINX-NEXT: fcvt.s.h a0, a0 ; RV64IZHINX-NEXT: mv a1, sp ; RV64IZHINX-NEXT: call frexpf -; RV64IZHINX-NEXT: ld a1, 0(sp) ; RV64IZHINX-NEXT: fcvt.h.s a0, a0 +; RV64IZHINX-NEXT: ld a1, 0(sp) ; RV64IZHINX-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IZHINX-NEXT: addi sp, sp, 16 ; RV64IZHINX-NEXT: ret @@ -3521,8 
+3521,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV32IZFHMIN-NEXT: fcvt.s.h fa0, fa0 ; RV32IZFHMIN-NEXT: addi a0, sp, 8 ; RV32IZFHMIN-NEXT: call frexpf -; RV32IZFHMIN-NEXT: lw a0, 8(sp) ; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa0 +; RV32IZFHMIN-NEXT: lw a0, 8(sp) ; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFHMIN-NEXT: addi sp, sp, 16 ; RV32IZFHMIN-NEXT: ret @@ -3534,8 +3534,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV64IZFHMIN-NEXT: fcvt.s.h fa0, fa0 ; RV64IZFHMIN-NEXT: mv a0, sp ; RV64IZFHMIN-NEXT: call frexpf -; RV64IZFHMIN-NEXT: ld a0, 0(sp) ; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa0 +; RV64IZFHMIN-NEXT: ld a0, 0(sp) ; RV64IZFHMIN-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IZFHMIN-NEXT: addi sp, sp, 16 ; RV64IZFHMIN-NEXT: ret @@ -3547,8 +3547,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; RV32IZHINXMIN-NEXT: addi a1, sp, 8 ; RV32IZHINXMIN-NEXT: call frexpf -; RV32IZHINXMIN-NEXT: lw a1, 8(sp) ; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV32IZHINXMIN-NEXT: lw a1, 8(sp) ; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZHINXMIN-NEXT: addi sp, sp, 16 ; RV32IZHINXMIN-NEXT: ret @@ -3560,8 +3560,8 @@ define {half, i32} @frexp_half(half %x) nounwind { ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 ; RV64IZHINXMIN-NEXT: mv a1, sp ; RV64IZHINXMIN-NEXT: call frexpf -; RV64IZHINXMIN-NEXT: ld a1, 0(sp) ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 +; RV64IZHINXMIN-NEXT: ld a1, 0(sp) ; RV64IZHINXMIN-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64IZHINXMIN-NEXT: addi sp, sp, 16 ; RV64IZHINXMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/half-mem.ll b/llvm/test/CodeGen/RISCV/half-mem.ll index 9ac2a4d037f8a..a910bb9eec875 100644 --- a/llvm/test/CodeGen/RISCV/half-mem.ll +++ b/llvm/test/CodeGen/RISCV/half-mem.ll @@ -33,21 +33,21 @@ define half @flh(ptr %a) nounwind { ; ; CHECKIZFHMIN-LABEL: flh: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: flh fa5, 6(a0) -; CHECKIZFHMIN-NEXT: flh fa4, 0(a0) -; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; CHECKIZFHMIN-NEXT: flh fa5, 0(a0) +; CHECKIZFHMIN-NEXT: flh fa4, 6(a0) ; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa4 -; CHECKIZFHMIN-NEXT: fadd.s fa5, fa4, fa5 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; CHECKIZFHMIN-NEXT: fadd.s fa5, fa5, fa4 ; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: flh: ; CHECKIZHINXMIN: # %bb.0: -; CHECKIZHINXMIN-NEXT: lh a1, 6(a0) -; CHECKIZHINXMIN-NEXT: lh a0, 0(a0) -; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: lh a1, 0(a0) +; CHECKIZHINXMIN-NEXT: lh a0, 6(a0) ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fadd.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 +; CHECKIZHINXMIN-NEXT: fadd.s a0, a1, a0 ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: ret %1 = load half, ptr %a diff --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll index d92dcb9eac4c6..9aff2d434689f 100644 --- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll @@ -737,12 +737,12 @@ define i32 @i32_select_fcmp_oeq(half %a, half %b, i32 %c, i32 %d) nounwind { ; ; CHECKIZHINX-LABEL: i32_select_fcmp_oeq: ; CHECKIZHINX: # %bb.0: -; CHECKIZHINX-NEXT: feq.h a1, a0, a1 -; CHECKIZHINX-NEXT: mv a0, a2 -; CHECKIZHINX-NEXT: bnez a1, .LBB16_2 +; CHECKIZHINX-NEXT: feq.h a0, a0, a1 +; CHECKIZHINX-NEXT: bnez a0, .LBB16_2 ; CHECKIZHINX-NEXT: # %bb.1: -; CHECKIZHINX-NEXT: mv a0, a3 +; CHECKIZHINX-NEXT: mv a2, a3 ; CHECKIZHINX-NEXT: 
.LBB16_2: +; CHECKIZHINX-NEXT: mv a0, a2 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: i32_select_fcmp_oeq: @@ -760,12 +760,12 @@ define i32 @i32_select_fcmp_oeq(half %a, half %b, i32 %c, i32 %d) nounwind { ; CHECKIZHINXMIN: # %bb.0: ; CHECKIZHINXMIN-NEXT: fcvt.s.h a1, a1 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: feq.s a1, a0, a1 -; CHECKIZHINXMIN-NEXT: mv a0, a2 -; CHECKIZHINXMIN-NEXT: bnez a1, .LBB16_2 +; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a1 +; CHECKIZHINXMIN-NEXT: bnez a0, .LBB16_2 ; CHECKIZHINXMIN-NEXT: # %bb.1: -; CHECKIZHINXMIN-NEXT: mv a0, a3 +; CHECKIZHINXMIN-NEXT: mv a2, a3 ; CHECKIZHINXMIN-NEXT: .LBB16_2: +; CHECKIZHINXMIN-NEXT: mv a0, a2 ; CHECKIZHINXMIN-NEXT: ret %1 = fcmp oeq half %a, %b %2 = select i1 %1, i32 %c, i32 %d diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll index 66cde323ce507..00fac434517c4 100644 --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) { define i128 @abs128(i128 %x) { ; RV32I-LABEL: abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a3, .LBB8_2 +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: bgez a1, .LBB8_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: neg a5, a1 -; RV32I-NEXT: snez a6, a4 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: snez a6, a3 ; RV32I-NEXT: snez a7, a2 -; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: snez a4, a4 +; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: sub a4, a4, a7 -; RV32I-NEXT: sltu a3, a5, a6 -; RV32I-NEXT: neg a7, a1 -; RV32I-NEXT: sub a1, a5, a6 -; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: sub a3, a3, a7 +; RV32I-NEXT: sltu a7, a5, a6 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a4, a5, a6 +; RV32I-NEXT: sub a1, a1, a7 ; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: .LBB8_2: ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a4, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a3, .LBB8_2 +; RV32ZBB-NEXT: lw a3, 4(a1) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: bgez a1, .LBB8_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: neg a5, a1 -; RV32ZBB-NEXT: snez a6, a4 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: snez a6, a3 ; RV32ZBB-NEXT: snez a7, a2 -; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: snez a4, a4 +; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: or a6, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 -; RV32ZBB-NEXT: sub a4, a4, a7 -; RV32ZBB-NEXT: sltu a3, a5, a6 -; RV32ZBB-NEXT: neg a7, a1 -; RV32ZBB-NEXT: sub a1, a5, a6 -; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: sub a3, a3, a7 +; RV32ZBB-NEXT: sltu a7, a5, a6 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a4, a5, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 ; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: .LBB8_2: ; RV32ZBB-NEXT: sw a2, 0(a0) -; RV32ZBB-NEXT: sw a4, 4(a0) -; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: 
ret ; ; RV64I-LABEL: abs128: @@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) { define i128 @select_abs128(i128 %x) { ; RV32I-LABEL: select_abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a3, 12(a1) ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a3, .LBB9_2 +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: bgez a1, .LBB9_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: neg a5, a1 -; RV32I-NEXT: snez a6, a4 +; RV32I-NEXT: neg a5, a4 +; RV32I-NEXT: snez a6, a3 ; RV32I-NEXT: snez a7, a2 -; RV32I-NEXT: snez a1, a1 -; RV32I-NEXT: neg a4, a4 +; RV32I-NEXT: snez a4, a4 +; RV32I-NEXT: neg a3, a3 ; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: sub a4, a4, a7 -; RV32I-NEXT: sltu a3, a5, a6 -; RV32I-NEXT: neg a7, a1 -; RV32I-NEXT: sub a1, a5, a6 -; RV32I-NEXT: sub a3, a7, a3 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: sub a3, a3, a7 +; RV32I-NEXT: sltu a7, a5, a6 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a4, a5, a6 +; RV32I-NEXT: sub a1, a1, a7 ; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: .LBB9_2: ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a4, 4(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: select_abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a3, 12(a1) ; RV32ZBB-NEXT: lw a2, 0(a1) -; RV32ZBB-NEXT: lw a4, 4(a1) -; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a3, .LBB9_2 +; RV32ZBB-NEXT: lw a3, 4(a1) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: bgez a1, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: neg a5, a1 -; RV32ZBB-NEXT: snez a6, a4 +; RV32ZBB-NEXT: neg a5, a4 +; RV32ZBB-NEXT: snez a6, a3 ; RV32ZBB-NEXT: snez a7, a2 -; RV32ZBB-NEXT: snez a1, a1 -; RV32ZBB-NEXT: neg a4, a4 +; RV32ZBB-NEXT: snez a4, a4 +; RV32ZBB-NEXT: neg a3, a3 ; RV32ZBB-NEXT: or a6, a7, a6 -; RV32ZBB-NEXT: add a1, a3, a1 -; RV32ZBB-NEXT: sub a4, a4, a7 -; RV32ZBB-NEXT: sltu a3, a5, a6 -; RV32ZBB-NEXT: neg a7, a1 -; RV32ZBB-NEXT: sub a1, a5, a6 -; RV32ZBB-NEXT: sub a3, a7, a3 +; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: sub a3, a3, a7 +; RV32ZBB-NEXT: sltu a7, a5, a6 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a4, a5, a6 +; RV32ZBB-NEXT: sub a1, a1, a7 ; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: .LBB9_2: ; RV32ZBB-NEXT: sw a2, 0(a0) -; RV32ZBB-NEXT: sw a4, 4(a0) -; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 12(a0) +; RV32ZBB-NEXT: sw a3, 4(a0) +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: select_abs128: diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll index d58e6fe7675da..bbc4c3735de45 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll @@ -26,11 +26,11 @@ define double @constraint_f_double(double %a) nounwind { ; ; RV64F-LABEL: constraint_f_double: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gd) -; RV64F-NEXT: fld fa5, %lo(gd)(a1) -; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: fmv.d.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gd) +; RV64F-NEXT: fld fa4, %lo(gd)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: fadd.d fa5, fa4, fa5 +; RV64F-NEXT: fadd.d fa5, fa5, fa4 ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.d a0, fa5 ; RV64F-NEXT: ret @@ -59,11 +59,11 @@ define double @constraint_cf_double(double %a) nounwind { ; ; RV64F-LABEL: 
constraint_cf_double: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gd) -; RV64F-NEXT: fld fa5, %lo(gd)(a1) -; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: fmv.d.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gd) +; RV64F-NEXT: fld fa4, %lo(gd)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: fadd.d fa5, fa4, fa5 +; RV64F-NEXT: fadd.d fa5, fa5, fa4 ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.d a0, fa5 ; RV64F-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll index 238a0fa0b6fd7..144ddb99e5c4c 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll @@ -29,11 +29,11 @@ define double @constraint_f_double(double %a) nounwind { ; ; RV64F-LABEL: constraint_f_double: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gd) -; RV64F-NEXT: fld fa5, %lo(gd)(a1) -; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: fmv.d.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gd) +; RV64F-NEXT: fld fa4, %lo(gd)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20) ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.d a0, fa5 ; RV64F-NEXT: ret @@ -62,11 +62,11 @@ define double @constraint_cf_double(double %a) nounwind { ; ; RV64F-LABEL: constraint_cf_double: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gd) -; RV64F-NEXT: fld fa5, %lo(gd)(a1) -; RV64F-NEXT: fmv.d.x fa4, a0 +; RV64F-NEXT: fmv.d.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gd) +; RV64F-NEXT: fld fa4, %lo(gd)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20) ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.d a0, fa5 ; RV64F-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll index f17f5ba15c605..8ed247d1398ad 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-f-constraint-f.ll @@ -13,22 +13,22 @@ define float @constraint_f_float(float %a) nounwind { ; RV32F-LABEL: constraint_f_float: ; RV32F: # %bb.0: -; RV32F-NEXT: lui a1, %hi(gf) -; RV32F-NEXT: flw fa5, %lo(gf)(a1) -; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: fmv.w.x fa5, a0 +; RV32F-NEXT: lui a0, %hi(gf) +; RV32F-NEXT: flw fa4, %lo(gf)(a0) ; RV32F-NEXT: #APP -; RV32F-NEXT: fadd.s fa5, fa4, fa5 +; RV32F-NEXT: fadd.s fa5, fa5, fa4 ; RV32F-NEXT: #NO_APP ; RV32F-NEXT: fmv.x.w a0, fa5 ; RV32F-NEXT: ret ; ; RV64F-LABEL: constraint_f_float: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gf) -; RV64F-NEXT: flw fa5, %lo(gf)(a1) -; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: fmv.w.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gf) +; RV64F-NEXT: flw fa4, %lo(gf)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: fadd.s fa5, fa4, fa5 +; RV64F-NEXT: fadd.s fa5, fa5, fa4 ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.w a0, fa5 ; RV64F-NEXT: ret @@ -40,22 +40,22 @@ define float @constraint_f_float(float %a) nounwind { define float @constraint_cf_float(float %a) nounwind { ; RV32F-LABEL: constraint_cf_float: ; RV32F: # %bb.0: -; RV32F-NEXT: lui a1, %hi(gf) -; RV32F-NEXT: flw fa5, %lo(gf)(a1) -; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: fmv.w.x fa5, a0 +; RV32F-NEXT: lui a0, %hi(gf) +; RV32F-NEXT: flw fa4, %lo(gf)(a0) ; RV32F-NEXT: #APP -; RV32F-NEXT: fadd.s fa5, fa4, fa5 +; RV32F-NEXT: fadd.s fa5, fa5, fa4 ; RV32F-NEXT: #NO_APP ; RV32F-NEXT: fmv.x.w a0, fa5 ; RV32F-NEXT: ret ; ; RV64F-LABEL: 
constraint_cf_float: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gf) -; RV64F-NEXT: flw fa5, %lo(gf)(a1) -; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: fmv.w.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gf) +; RV64F-NEXT: flw fa4, %lo(gf)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: fadd.s fa5, fa4, fa5 +; RV64F-NEXT: fadd.s fa5, fa5, fa4 ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.w a0, fa5 ; RV64F-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll index a0de5c71a7df6..10ed6367a49c2 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-f-modifier-N.ll @@ -16,22 +16,22 @@ define float @constraint_f_modifier_N_float(float %a) nounwind { ; RV32F-LABEL: constraint_f_modifier_N_float: ; RV32F: # %bb.0: -; RV32F-NEXT: lui a1, %hi(gf) -; RV32F-NEXT: flw fa5, %lo(gf)(a1) -; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: fmv.w.x fa5, a0 +; RV32F-NEXT: lui a0, %hi(gf) +; RV32F-NEXT: flw fa4, %lo(gf)(a0) ; RV32F-NEXT: #APP -; RV32F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV32F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20) ; RV32F-NEXT: #NO_APP ; RV32F-NEXT: fmv.x.w a0, fa5 ; RV32F-NEXT: ret ; ; RV64F-LABEL: constraint_f_modifier_N_float: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gf) -; RV64F-NEXT: flw fa5, %lo(gf)(a1) -; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: fmv.w.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gf) +; RV64F-NEXT: flw fa4, %lo(gf)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20) ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.w a0, fa5 ; RV64F-NEXT: ret @@ -44,22 +44,22 @@ define float @constraint_f_modifier_N_float(float %a) nounwind { define float @constraint_cf_modifier_N_float(float %a) nounwind { ; RV32F-LABEL: constraint_cf_modifier_N_float: ; RV32F: # %bb.0: -; RV32F-NEXT: lui a1, %hi(gf) -; RV32F-NEXT: flw fa5, %lo(gf)(a1) -; RV32F-NEXT: fmv.w.x fa4, a0 +; RV32F-NEXT: fmv.w.x fa5, a0 +; RV32F-NEXT: lui a0, %hi(gf) +; RV32F-NEXT: flw fa4, %lo(gf)(a0) ; RV32F-NEXT: #APP -; RV32F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV32F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20) ; RV32F-NEXT: #NO_APP ; RV32F-NEXT: fmv.x.w a0, fa5 ; RV32F-NEXT: ret ; ; RV64F-LABEL: constraint_cf_modifier_N_float: ; RV64F: # %bb.0: -; RV64F-NEXT: lui a1, %hi(gf) -; RV64F-NEXT: flw fa5, %lo(gf)(a1) -; RV64F-NEXT: fmv.w.x fa4, a0 +; RV64F-NEXT: fmv.w.x fa5, a0 +; RV64F-NEXT: lui a0, %hi(gf) +; RV64F-NEXT: flw fa4, %lo(gf)(a0) ; RV64F-NEXT: #APP -; RV64F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (14 << 15) | (15 << 20) +; RV64F-NEXT: .insn 0x4, 0x53 | (15 << 7) | (15 << 15) | (14 << 20) ; RV64F-NEXT: #NO_APP ; RV64F-NEXT: fmv.x.w a0, fa5 ; RV64F-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll index 1c0de6c3f1612..4c15eaf7954d4 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-zfinx-constraint-r.ll @@ -57,9 +57,9 @@ define float @constraint_float_abi_name(float %a) nounwind { ; RV32FINX: # %bb.0: ; RV32FINX-NEXT: addi sp, sp, -16 ; RV32FINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32FINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV32FINX-NEXT: lui a1, %hi(gf) ; RV32FINX-NEXT: lw s0, %lo(gf)(a1) -; RV32FINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV32FINX-NEXT: #APP 
; RV32FINX-NEXT: fadd.s t0, a0, s0 ; RV32FINX-NEXT: #NO_APP @@ -72,9 +72,9 @@ define float @constraint_float_abi_name(float %a) nounwind { ; RV64FINX: # %bb.0: ; RV64FINX-NEXT: addi sp, sp, -16 ; RV64FINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64FINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV64FINX-NEXT: lui a1, %hi(gf) ; RV64FINX-NEXT: lw s0, %lo(gf)(a1) -; RV64FINX-NEXT: # kill: def $x10_w killed $x10_w def $x10 ; RV64FINX-NEXT: #APP ; RV64FINX-NEXT: fadd.s t0, a0, s0 ; RV64FINX-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll index 086d2a1d6f3b2..4482d68eba122 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-zhinx-constraint-r.ll @@ -97,9 +97,9 @@ define half @constraint_half_abi_name(half %a) nounwind { ; RV32ZHINX: # %bb.0: ; RV32ZHINX-NEXT: addi sp, sp, -16 ; RV32ZHINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32ZHINX-NEXT: lui a1, %hi(gh) ; RV32ZHINX-NEXT: lh s0, %lo(gh)(a1) -; RV32ZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32ZHINX-NEXT: #APP ; RV32ZHINX-NEXT: fadd.s t0, a0, s0 ; RV32ZHINX-NEXT: #NO_APP @@ -112,9 +112,9 @@ define half @constraint_half_abi_name(half %a) nounwind { ; RV64ZHINX: # %bb.0: ; RV64ZHINX-NEXT: addi sp, sp, -16 ; RV64ZHINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64ZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64ZHINX-NEXT: lui a1, %hi(gh) ; RV64ZHINX-NEXT: lh s0, %lo(gh)(a1) -; RV64ZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64ZHINX-NEXT: #APP ; RV64ZHINX-NEXT: fadd.s t0, a0, s0 ; RV64ZHINX-NEXT: #NO_APP @@ -127,9 +127,9 @@ define half @constraint_half_abi_name(half %a) nounwind { ; RV32DINXZHINX: # %bb.0: ; RV32DINXZHINX-NEXT: addi sp, sp, -16 ; RV32DINXZHINX-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32DINXZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32DINXZHINX-NEXT: lui a1, %hi(gh) ; RV32DINXZHINX-NEXT: lh s0, %lo(gh)(a1) -; RV32DINXZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV32DINXZHINX-NEXT: #APP ; RV32DINXZHINX-NEXT: fadd.s t0, a0, s0 ; RV32DINXZHINX-NEXT: #NO_APP @@ -142,9 +142,9 @@ define half @constraint_half_abi_name(half %a) nounwind { ; RV64DINXZHINX: # %bb.0: ; RV64DINXZHINX-NEXT: addi sp, sp, -16 ; RV64DINXZHINX-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64DINXZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64DINXZHINX-NEXT: lui a1, %hi(gh) ; RV64DINXZHINX-NEXT: lh s0, %lo(gh)(a1) -; RV64DINXZHINX-NEXT: # kill: def $x10_h killed $x10_h def $x10 ; RV64DINXZHINX-NEXT: #APP ; RV64DINXZHINX-NEXT: fadd.s t0, a0, s0 ; RV64DINXZHINX-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/RISCV/inline-asm.ll b/llvm/test/CodeGen/RISCV/inline-asm.ll index 79266743a1d05..7382ab4d3d1c2 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm.ll @@ -34,21 +34,21 @@ define i32 @constraint_r(i32 %a) nounwind { define i32 @constraint_r_zero(i32 %a) nounwind { ; RV32I-LABEL: constraint_r_zero: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, %hi(gi) -; RV32I-NEXT: lw a0, %lo(gi)(a0) -; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: lui a1, %hi(gi) +; RV32I-NEXT: lw a1, %lo(gi)(a1) ; RV32I-NEXT: #APP -; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_r_zero: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, %hi(gi) -; RV64I-NEXT: lw a0, 
%lo(gi)(a0) -; RV64I-NEXT: li a1, 0 +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: lui a1, %hi(gi) +; RV64I-NEXT: lw a1, %lo(gi)(a1) ; RV64I-NEXT: #APP -; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret %1 = load i32, ptr @gi diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index 111b3e2bf82ce..391448b28c20b 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -75,15 +75,15 @@ define i64 @ctz_nxv8i1_no_range( %a) { ; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: vid.v v8 -; RV32-NEXT: li a2, -1 -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vl2r.v v24, (a3) # Unknown-size Folded Reload +; RV32-NEXT: li a3, -1 +; RV32-NEXT: addi a4, sp, 32 +; RV32-NEXT: vl2r.v v16, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vmsne.vi v0, v24, 0 +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vmadd.vx v8, a2, v16 +; RV32-NEXT: vmadd.vx v8, a3, v16 ; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vmerge.vim v16, v16, -1, v0 diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll index f60b77b92c09e..38cce2121c91d 100644 --- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll +++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll @@ -30,12 +30,12 @@ entry: define void @test2(ptr %a, ptr %b) nounwind { ; RV32-LABEL: test2: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a1) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: lui a3, 524288 -; RV32-NEXT: xor a2, a2, a3 -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a1, 4(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: test2: @@ -56,27 +56,27 @@ entry: define void @test3(ptr %a, ptr %b) nounwind { ; RV32-LABEL: test3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a2, 12(a1) -; RV32-NEXT: lw a3, 0(a1) -; RV32-NEXT: lw a4, 4(a1) -; RV32-NEXT: lw a1, 8(a1) +; RV32-NEXT: lw a2, 0(a1) +; RV32-NEXT: lw a3, 4(a1) +; RV32-NEXT: lw a4, 8(a1) +; RV32-NEXT: lw a1, 12(a1) ; RV32-NEXT: lui a5, 524288 -; RV32-NEXT: xor a2, a2, a5 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a4, 4(a0) -; RV32-NEXT: sw a1, 8(a0) -; RV32-NEXT: sw a2, 12(a0) +; RV32-NEXT: xor a1, a1, a5 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a3, 4(a0) +; RV32-NEXT: sw a4, 8(a0) +; RV32-NEXT: sw a1, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: test3: ; RV64: # %bb.0: # %entry -; RV64-NEXT: ld a2, 8(a1) -; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: ld a2, 0(a1) +; RV64-NEXT: ld a1, 8(a1) ; RV64-NEXT: li a3, -1 ; RV64-NEXT: slli a3, a3, 63 -; RV64-NEXT: xor a2, a2, a3 -; RV64-NEXT: sd a1, 0(a0) -; RV64-NEXT: sd a2, 8(a0) +; RV64-NEXT: xor a1, a1, a3 +; RV64-NEXT: sd a2, 0(a0) +; RV64-NEXT: sd a1, 8(a0) ; RV64-NEXT: ret entry: %0 = load fp128, ptr %b diff --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll index 7b199504837e8..51189ef60e852 100644 --- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll +++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll @@ -143,12 +143,12 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; RV32IFD-NEXT: .cfi_offset fs1, -32 ; RV32IFD-NEXT: .cfi_offset fs2, -40 ; RV32IFD-NEXT: mv s0, a0 -; RV32IFD-NEXT: 
lhu a0, 8(a1) -; RV32IFD-NEXT: lhu a2, 0(a1) -; RV32IFD-NEXT: lhu a1, 4(a1) -; RV32IFD-NEXT: fmv.w.x fs0, a0 -; RV32IFD-NEXT: fmv.w.x fs1, a2 -; RV32IFD-NEXT: fmv.w.x fa0, a1 +; RV32IFD-NEXT: lhu a0, 0(a1) +; RV32IFD-NEXT: lhu a2, 4(a1) +; RV32IFD-NEXT: lhu a1, 8(a1) +; RV32IFD-NEXT: fmv.w.x fs0, a1 +; RV32IFD-NEXT: fmv.w.x fs1, a0 +; RV32IFD-NEXT: fmv.w.x fa0, a2 ; RV32IFD-NEXT: call __extendhfsf2 ; RV32IFD-NEXT: call exp10f ; RV32IFD-NEXT: call __truncsfhf2 @@ -200,11 +200,11 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) { ; RV64IFD-NEXT: .cfi_offset s1, -24 ; RV64IFD-NEXT: .cfi_offset s2, -32 ; RV64IFD-NEXT: .cfi_offset fs0, -40 +; RV64IFD-NEXT: mv s0, a0 ; RV64IFD-NEXT: lhu s1, 0(a1) -; RV64IFD-NEXT: lhu a2, 8(a1) +; RV64IFD-NEXT: lhu a0, 8(a1) ; RV64IFD-NEXT: lhu s2, 16(a1) -; RV64IFD-NEXT: mv s0, a0 -; RV64IFD-NEXT: fmv.w.x fa0, a2 +; RV64IFD-NEXT: fmv.w.x fa0, a0 ; RV64IFD-NEXT: call __extendhfsf2 ; RV64IFD-NEXT: call exp10f ; RV64IFD-NEXT: call __truncsfhf2 @@ -267,14 +267,14 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; RV32IFD-NEXT: .cfi_offset fs2, -48 ; RV32IFD-NEXT: .cfi_offset fs3, -56 ; RV32IFD-NEXT: mv s0, a0 -; RV32IFD-NEXT: lhu a0, 12(a1) -; RV32IFD-NEXT: lhu a2, 0(a1) -; RV32IFD-NEXT: lhu a3, 4(a1) -; RV32IFD-NEXT: lhu a1, 8(a1) -; RV32IFD-NEXT: fmv.w.x fs0, a0 -; RV32IFD-NEXT: fmv.w.x fs1, a2 -; RV32IFD-NEXT: fmv.w.x fs2, a3 -; RV32IFD-NEXT: fmv.w.x fa0, a1 +; RV32IFD-NEXT: lhu a0, 0(a1) +; RV32IFD-NEXT: lhu a2, 4(a1) +; RV32IFD-NEXT: lhu a3, 8(a1) +; RV32IFD-NEXT: lhu a1, 12(a1) +; RV32IFD-NEXT: fmv.w.x fs0, a1 +; RV32IFD-NEXT: fmv.w.x fs1, a0 +; RV32IFD-NEXT: fmv.w.x fs2, a2 +; RV32IFD-NEXT: fmv.w.x fa0, a3 ; RV32IFD-NEXT: call __extendhfsf2 ; RV32IFD-NEXT: call exp10f ; RV32IFD-NEXT: call __truncsfhf2 @@ -343,12 +343,12 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; RV64IFD-NEXT: .cfi_offset fs0, -48 ; RV64IFD-NEXT: .cfi_offset fs1, -56 ; RV64IFD-NEXT: .cfi_offset fs2, -64 +; RV64IFD-NEXT: mv s0, a0 ; RV64IFD-NEXT: lhu s1, 0(a1) ; RV64IFD-NEXT: lhu s2, 8(a1) -; RV64IFD-NEXT: lhu a2, 16(a1) +; RV64IFD-NEXT: lhu a0, 16(a1) ; RV64IFD-NEXT: lhu s3, 24(a1) -; RV64IFD-NEXT: mv s0, a0 -; RV64IFD-NEXT: fmv.w.x fa0, a2 +; RV64IFD-NEXT: fmv.w.x fa0, a0 ; RV64IFD-NEXT: call __extendhfsf2 ; RV64IFD-NEXT: call exp10f ; RV64IFD-NEXT: call __truncsfhf2 diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll index 4a77b4d32cdda..28f56e49b6693 100644 --- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll +++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll @@ -730,38 +730,37 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw s0, 4(a1) -; RV32I-NEXT: lw s1, 8(a1) -; RV32I-NEXT: lw s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw s1, 4(a1) +; RV32I-NEXT: lw s2, 8(a1) +; RV32I-NEXT: lw s3, 12(a1) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: addi a1, sp, 12 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call frexpf -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, sp, 16 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, sp, 20 +; RV32I-NEXT: addi a1, sp, 16 ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call frexpf +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: addi a1, sp, 20 +; 
RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call frexpf ; RV32I-NEXT: lw a1, 8(sp) ; RV32I-NEXT: lw a2, 12(sp) ; RV32I-NEXT: lw a3, 16(sp) ; RV32I-NEXT: lw a4, 20(sp) -; RV32I-NEXT: sw s4, 0(s3) -; RV32I-NEXT: sw s0, 4(s3) -; RV32I-NEXT: sw s1, 8(s3) -; RV32I-NEXT: sw a0, 12(s3) -; RV32I-NEXT: sw a1, 16(s3) -; RV32I-NEXT: sw a2, 20(s3) -; RV32I-NEXT: sw a3, 24(s3) -; RV32I-NEXT: sw a4, 28(s3) +; RV32I-NEXT: sw s4, 0(s0) +; RV32I-NEXT: sw s1, 4(s0) +; RV32I-NEXT: sw s2, 8(s0) +; RV32I-NEXT: sw a0, 12(s0) +; RV32I-NEXT: sw a1, 16(s0) +; RV32I-NEXT: sw a2, 20(s0) +; RV32I-NEXT: sw a3, 24(s0) +; RV32I-NEXT: sw a4, 28(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -780,38 +779,37 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a2, 0(a1) -; RV64I-NEXT: lw s0, 8(a1) -; RV64I-NEXT: lw s1, 16(a1) -; RV64I-NEXT: lw s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lw a0, 0(a1) +; RV64I-NEXT: lw s1, 8(a1) +; RV64I-NEXT: lw s2, 16(a1) +; RV64I-NEXT: lw s3, 24(a1) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: addi a1, sp, 4 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call frexpf -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: addi a1, sp, 12 +; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call frexpf +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: addi a1, sp, 12 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call frexpf ; RV64I-NEXT: lw a1, 0(sp) ; RV64I-NEXT: lw a2, 4(sp) ; RV64I-NEXT: lw a3, 8(sp) ; RV64I-NEXT: lw a4, 12(sp) -; RV64I-NEXT: sw s4, 0(s3) -; RV64I-NEXT: sw s0, 4(s3) -; RV64I-NEXT: sw s1, 8(s3) -; RV64I-NEXT: sw a0, 12(s3) -; RV64I-NEXT: sw a1, 16(s3) -; RV64I-NEXT: sw a2, 20(s3) -; RV64I-NEXT: sw a3, 24(s3) -; RV64I-NEXT: sw a4, 28(s3) +; RV64I-NEXT: sw s4, 0(s0) +; RV64I-NEXT: sw s1, 4(s0) +; RV64I-NEXT: sw s2, 8(s0) +; RV64I-NEXT: sw a0, 12(s0) +; RV64I-NEXT: sw a1, 16(s0) +; RV64I-NEXT: sw a2, 20(s0) +; RV64I-NEXT: sw a3, 24(s0) +; RV64I-NEXT: sw a4, 28(s0) ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload @@ -998,30 +996,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi ; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw s0, 4(a1) -; RV32I-NEXT: lw s1, 8(a1) -; RV32I-NEXT: lw s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw s1, 4(a1) +; RV32I-NEXT: lw s2, 8(a1) +; RV32I-NEXT: lw s3, 12(a1) ; RV32I-NEXT: addi a1, sp, 8 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: addi a1, sp, 12 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call frexpf -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: addi a1, sp, 16 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: addi a1, sp, 20 +; RV32I-NEXT: addi a1, sp, 16 ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call frexpf -; RV32I-NEXT: sw s4, 0(s3) -; RV32I-NEXT: sw 
s0, 4(s3) -; RV32I-NEXT: sw s1, 8(s3) -; RV32I-NEXT: sw a0, 12(s3) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: addi a1, sp, 20 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call frexpf +; RV32I-NEXT: sw s4, 0(s0) +; RV32I-NEXT: sw s1, 4(s0) +; RV32I-NEXT: sw s2, 8(s0) +; RV32I-NEXT: sw a0, 12(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -1040,30 +1037,29 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi ; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a2, 0(a1) -; RV64I-NEXT: lw s0, 8(a1) -; RV64I-NEXT: lw s1, 16(a1) -; RV64I-NEXT: lw s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lw a0, 0(a1) +; RV64I-NEXT: lw s1, 8(a1) +; RV64I-NEXT: lw s2, 16(a1) +; RV64I-NEXT: lw s3, 24(a1) ; RV64I-NEXT: mv a1, sp -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: addi a1, sp, 4 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call frexpf -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: addi a1, sp, 12 +; RV64I-NEXT: addi a1, sp, 8 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call frexpf -; RV64I-NEXT: sw s4, 0(s3) -; RV64I-NEXT: sw s0, 4(s3) -; RV64I-NEXT: sw s1, 8(s3) -; RV64I-NEXT: sw a0, 12(s3) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: addi a1, sp, 12 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call frexpf +; RV64I-NEXT: sw s4, 0(s0) +; RV64I-NEXT: sw s1, 4(s0) +; RV64I-NEXT: sw s2, 8(s0) +; RV64I-NEXT: sw a0, 12(s0) ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload @@ -1230,31 +1226,30 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw s0, 4(a1) -; RV32I-NEXT: lw s1, 8(a1) -; RV32I-NEXT: lw s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw s1, 4(a1) +; RV32I-NEXT: lw s2, 8(a1) +; RV32I-NEXT: lw s3, 12(a1) ; RV32I-NEXT: addi a1, sp, 12 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: addi a1, sp, 16 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: addi a1, sp, 20 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: addi a1, sp, 24 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s3 ; RV32I-NEXT: call frexpf ; RV32I-NEXT: lw a0, 12(sp) ; RV32I-NEXT: lw a1, 16(sp) ; RV32I-NEXT: lw a2, 20(sp) ; RV32I-NEXT: lw a3, 24(sp) -; RV32I-NEXT: sw a0, 0(s3) -; RV32I-NEXT: sw a1, 4(s3) -; RV32I-NEXT: sw a2, 8(s3) -; RV32I-NEXT: sw a3, 12(s3) +; RV32I-NEXT: sw a0, 0(s0) +; RV32I-NEXT: sw a1, 4(s0) +; RV32I-NEXT: sw a2, 8(s0) +; RV32I-NEXT: sw a3, 12(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -1271,31 +1266,30 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded 
Spill -; RV64I-NEXT: lw a2, 0(a1) -; RV64I-NEXT: lw s0, 8(a1) -; RV64I-NEXT: lw s1, 16(a1) -; RV64I-NEXT: lw s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lw a0, 0(a1) +; RV64I-NEXT: lw s1, 8(a1) +; RV64I-NEXT: lw s2, 16(a1) +; RV64I-NEXT: lw s3, 24(a1) ; RV64I-NEXT: addi a1, sp, 8 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: addi a1, sp, 12 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: addi a1, sp, 16 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: addi a1, sp, 20 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s3 ; RV64I-NEXT: call frexpf ; RV64I-NEXT: lw a0, 8(sp) ; RV64I-NEXT: lw a1, 12(sp) ; RV64I-NEXT: lw a2, 16(sp) ; RV64I-NEXT: lw a3, 20(sp) -; RV64I-NEXT: sw a0, 0(s3) -; RV64I-NEXT: sw a1, 4(s3) -; RV64I-NEXT: sw a2, 8(s3) -; RV64I-NEXT: sw a3, 12(s3) +; RV64I-NEXT: sw a0, 0(s0) +; RV64I-NEXT: sw a1, 4(s0) +; RV64I-NEXT: sw a2, 8(s0) +; RV64I-NEXT: sw a3, 12(s0) ; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload @@ -1547,18 +1541,18 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { ; RV32IFD-NEXT: addi sp, sp, -48 ; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: lw a3, 0(a1) -; RV32IFD-NEXT: lw a4, 4(a1) -; RV32IFD-NEXT: lw a5, 8(a1) -; RV32IFD-NEXT: lw a6, 12(a1) ; RV32IFD-NEXT: mv s0, a0 +; RV32IFD-NEXT: lw a0, 0(a1) +; RV32IFD-NEXT: lw a2, 4(a1) +; RV32IFD-NEXT: lw a3, 8(a1) +; RV32IFD-NEXT: lw a1, 12(a1) +; RV32IFD-NEXT: sw a0, 0(sp) +; RV32IFD-NEXT: sw a2, 4(sp) +; RV32IFD-NEXT: sw a3, 8(sp) +; RV32IFD-NEXT: sw a1, 12(sp) ; RV32IFD-NEXT: addi a0, sp, 16 ; RV32IFD-NEXT: mv a1, sp ; RV32IFD-NEXT: addi a2, sp, 36 -; RV32IFD-NEXT: sw a3, 0(sp) -; RV32IFD-NEXT: sw a4, 4(sp) -; RV32IFD-NEXT: sw a5, 8(sp) -; RV32IFD-NEXT: sw a6, 12(sp) ; RV32IFD-NEXT: call frexpl ; RV32IFD-NEXT: lw a0, 36(sp) ; RV32IFD-NEXT: lw a1, 16(sp) @@ -1600,18 +1594,18 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -48 ; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: lw a3, 0(a1) -; RV32IZFINXZDINX-NEXT: lw a4, 4(a1) -; RV32IZFINXZDINX-NEXT: lw a5, 8(a1) -; RV32IZFINXZDINX-NEXT: lw a6, 12(a1) ; RV32IZFINXZDINX-NEXT: mv s0, a0 +; RV32IZFINXZDINX-NEXT: lw a0, 0(a1) +; RV32IZFINXZDINX-NEXT: lw a2, 4(a1) +; RV32IZFINXZDINX-NEXT: lw a3, 8(a1) +; RV32IZFINXZDINX-NEXT: lw a1, 12(a1) +; RV32IZFINXZDINX-NEXT: sw a0, 0(sp) +; RV32IZFINXZDINX-NEXT: sw a2, 4(sp) +; RV32IZFINXZDINX-NEXT: sw a3, 8(sp) +; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: addi a0, sp, 16 ; RV32IZFINXZDINX-NEXT: mv a1, sp ; RV32IZFINXZDINX-NEXT: addi a2, sp, 36 -; RV32IZFINXZDINX-NEXT: sw a3, 0(sp) -; RV32IZFINXZDINX-NEXT: sw a4, 4(sp) -; RV32IZFINXZDINX-NEXT: sw a5, 8(sp) -; RV32IZFINXZDINX-NEXT: sw a6, 12(sp) ; RV32IZFINXZDINX-NEXT: call frexpl ; RV32IZFINXZDINX-NEXT: lw a0, 36(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 16(sp) @@ -1653,18 +1647,18 @@ define { fp128, i32 } @test_frexp_f128_i32(fp128 %a) nounwind { ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a5, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) ; RV32I-NEXT: mv 
s0, a0 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sw a0, 0(sp) +; RV32I-NEXT: sw a2, 4(sp) +; RV32I-NEXT: sw a3, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a0, sp, 16 ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a2, sp, 36 -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: call frexpl ; RV32I-NEXT: lw a0, 36(sp) ; RV32I-NEXT: lw a1, 16(sp) @@ -1710,18 +1704,18 @@ define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind { ; RV32IFD-NEXT: addi sp, sp, -48 ; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: lw a3, 0(a1) -; RV32IFD-NEXT: lw a4, 4(a1) -; RV32IFD-NEXT: lw a5, 8(a1) -; RV32IFD-NEXT: lw a6, 12(a1) ; RV32IFD-NEXT: mv s0, a0 +; RV32IFD-NEXT: lw a0, 0(a1) +; RV32IFD-NEXT: lw a2, 4(a1) +; RV32IFD-NEXT: lw a3, 8(a1) +; RV32IFD-NEXT: lw a1, 12(a1) +; RV32IFD-NEXT: sw a0, 0(sp) +; RV32IFD-NEXT: sw a2, 4(sp) +; RV32IFD-NEXT: sw a3, 8(sp) +; RV32IFD-NEXT: sw a1, 12(sp) ; RV32IFD-NEXT: addi a0, sp, 16 ; RV32IFD-NEXT: mv a1, sp ; RV32IFD-NEXT: addi a2, sp, 36 -; RV32IFD-NEXT: sw a3, 0(sp) -; RV32IFD-NEXT: sw a4, 4(sp) -; RV32IFD-NEXT: sw a5, 8(sp) -; RV32IFD-NEXT: sw a6, 12(sp) ; RV32IFD-NEXT: call frexpl ; RV32IFD-NEXT: lw a0, 16(sp) ; RV32IFD-NEXT: lw a1, 20(sp) @@ -1751,18 +1745,18 @@ define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind { ; RV32IZFINXZDINX-NEXT: addi sp, sp, -48 ; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: lw a3, 0(a1) -; RV32IZFINXZDINX-NEXT: lw a4, 4(a1) -; RV32IZFINXZDINX-NEXT: lw a5, 8(a1) -; RV32IZFINXZDINX-NEXT: lw a6, 12(a1) ; RV32IZFINXZDINX-NEXT: mv s0, a0 +; RV32IZFINXZDINX-NEXT: lw a0, 0(a1) +; RV32IZFINXZDINX-NEXT: lw a2, 4(a1) +; RV32IZFINXZDINX-NEXT: lw a3, 8(a1) +; RV32IZFINXZDINX-NEXT: lw a1, 12(a1) +; RV32IZFINXZDINX-NEXT: sw a0, 0(sp) +; RV32IZFINXZDINX-NEXT: sw a2, 4(sp) +; RV32IZFINXZDINX-NEXT: sw a3, 8(sp) +; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: addi a0, sp, 16 ; RV32IZFINXZDINX-NEXT: mv a1, sp ; RV32IZFINXZDINX-NEXT: addi a2, sp, 36 -; RV32IZFINXZDINX-NEXT: sw a3, 0(sp) -; RV32IZFINXZDINX-NEXT: sw a4, 4(sp) -; RV32IZFINXZDINX-NEXT: sw a5, 8(sp) -; RV32IZFINXZDINX-NEXT: sw a6, 12(sp) ; RV32IZFINXZDINX-NEXT: call frexpl ; RV32IZFINXZDINX-NEXT: lw a0, 16(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 20(sp) @@ -1792,18 +1786,18 @@ define fp128 @test_frexp_f128_i32_only_use_fract(fp128 %a) nounwind { ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) -; RV32I-NEXT: lw a5, 8(a1) -; RV32I-NEXT: lw a6, 12(a1) ; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: lw a3, 8(a1) +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sw a0, 0(sp) +; RV32I-NEXT: sw a2, 4(sp) +; RV32I-NEXT: sw a3, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) ; RV32I-NEXT: addi a0, sp, 16 ; RV32I-NEXT: mv a1, sp ; RV32I-NEXT: addi a2, sp, 36 -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: call frexpl ; RV32I-NEXT: lw a0, 16(sp) ; RV32I-NEXT: lw a1, 20(sp) @@ -1837,17 +1831,17 @@ define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind { ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: 
addi sp, sp, -48 ; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: lw a3, 0(a0) -; RV32IFD-NEXT: lw a4, 4(a0) -; RV32IFD-NEXT: lw a5, 8(a0) -; RV32IFD-NEXT: lw a6, 12(a0) +; RV32IFD-NEXT: lw a1, 0(a0) +; RV32IFD-NEXT: lw a2, 4(a0) +; RV32IFD-NEXT: lw a3, 8(a0) +; RV32IFD-NEXT: lw a0, 12(a0) +; RV32IFD-NEXT: sw a1, 8(sp) +; RV32IFD-NEXT: sw a2, 12(sp) +; RV32IFD-NEXT: sw a3, 16(sp) +; RV32IFD-NEXT: sw a0, 20(sp) ; RV32IFD-NEXT: addi a0, sp, 24 ; RV32IFD-NEXT: addi a1, sp, 8 ; RV32IFD-NEXT: addi a2, sp, 40 -; RV32IFD-NEXT: sw a3, 8(sp) -; RV32IFD-NEXT: sw a4, 12(sp) -; RV32IFD-NEXT: sw a5, 16(sp) -; RV32IFD-NEXT: sw a6, 20(sp) ; RV32IFD-NEXT: call frexpl ; RV32IFD-NEXT: lw a0, 40(sp) ; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload @@ -1869,17 +1863,17 @@ define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind { ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: addi sp, sp, -48 ; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: lw a3, 0(a0) -; RV32IZFINXZDINX-NEXT: lw a4, 4(a0) -; RV32IZFINXZDINX-NEXT: lw a5, 8(a0) -; RV32IZFINXZDINX-NEXT: lw a6, 12(a0) +; RV32IZFINXZDINX-NEXT: lw a1, 0(a0) +; RV32IZFINXZDINX-NEXT: lw a2, 4(a0) +; RV32IZFINXZDINX-NEXT: lw a3, 8(a0) +; RV32IZFINXZDINX-NEXT: lw a0, 12(a0) +; RV32IZFINXZDINX-NEXT: sw a1, 8(sp) +; RV32IZFINXZDINX-NEXT: sw a2, 12(sp) +; RV32IZFINXZDINX-NEXT: sw a3, 16(sp) +; RV32IZFINXZDINX-NEXT: sw a0, 20(sp) ; RV32IZFINXZDINX-NEXT: addi a0, sp, 24 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 8 ; RV32IZFINXZDINX-NEXT: addi a2, sp, 40 -; RV32IZFINXZDINX-NEXT: sw a3, 8(sp) -; RV32IZFINXZDINX-NEXT: sw a4, 12(sp) -; RV32IZFINXZDINX-NEXT: sw a5, 16(sp) -; RV32IZFINXZDINX-NEXT: sw a6, 20(sp) ; RV32IZFINXZDINX-NEXT: call frexpl ; RV32IZFINXZDINX-NEXT: lw a0, 40(sp) ; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload @@ -1901,17 +1895,17 @@ define i32 @test_frexp_f128_i32_only_use_exp(fp128 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -48 ; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a3, 0(a0) -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: lw a5, 8(a0) -; RV32I-NEXT: lw a6, 12(a0) +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 4(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: sw a1, 8(sp) +; RV32I-NEXT: sw a2, 12(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: addi a0, sp, 24 ; RV32I-NEXT: addi a1, sp, 8 ; RV32I-NEXT: addi a2, sp, 40 -; RV32I-NEXT: sw a3, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) -; RV32I-NEXT: sw a5, 16(sp) -; RV32I-NEXT: sw a6, 20(sp) ; RV32I-NEXT: call frexpl ; RV32I-NEXT: lw a0, 40(sp) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll index fa8ca071d2189..627f0005932a3 100644 --- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll +++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-add-cheaper-than-mul.ll @@ -43,8 +43,8 @@ define void @test(i32 signext %i) nounwind { ; RV32-NEXT: .LBB0_2: # %bb ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: add a4, a2, a1 -; RV32-NEXT: add a1, a1, a0 ; RV32-NEXT: sb zero, 0(a4) +; RV32-NEXT: add a1, a1, a0 ; RV32-NEXT: blt a1, a3, .LBB0_2 ; RV32-NEXT: .LBB0_3: # %return ; RV32-NEXT: ret @@ -63,8 +63,8 @@ define void @test(i32 signext %i) nounwind { ; RV64-NEXT: slli a4, a1, 32 ; RV64-NEXT: srli a4, a4, 32 ; RV64-NEXT: add a4, a2, a4 -; RV64-NEXT: addw a1, a1, a0 ; 
RV64-NEXT: sb zero, 0(a4) +; RV64-NEXT: addw a1, a1, a0 ; RV64-NEXT: blt a1, a3, .LBB0_2 ; RV64-NEXT: .LBB0_3: # %return ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll b/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll index eb84774014a4b..b3777668e20bd 100644 --- a/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll +++ b/llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll @@ -319,8 +319,8 @@ define signext i32 @branch_dispatch(i8 %a) { ; CHECK-NEXT: li a1, 70 ; CHECK-NEXT: beq a0, a1, .LBB3_9 ; CHECK-NEXT: # %bb.3: # %case.3 -; CHECK-NEXT: li a1, 234 ; CHECK-NEXT: li s0, 23 +; CHECK-NEXT: li a1, 234 ; CHECK-NEXT: beq a0, a1, .LBB3_10 ; CHECK-NEXT: # %bb.4: # %case.4 ; CHECK-NEXT: beqz a0, .LBB3_11 diff --git a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll index 8deb17582cb11..ae9572328bd5d 100644 --- a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll +++ b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll @@ -57,29 +57,29 @@ ret: define void @test_la(i32 signext %n) { ; RV32I-LABEL: test_la: ; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: .Lpcrel_hi1: -; RV32I-NEXT: auipc a1, %got_pcrel_hi(g) -; RV32I-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi1)(a1) -; RV32I-NEXT: li a2, 0 +; RV32I-NEXT: auipc a2, %got_pcrel_hi(g) +; RV32I-NEXT: lw a2, %pcrel_lo(.Lpcrel_hi1)(a2) ; RV32I-NEXT: .LBB1_1: # %loop ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: lw zero, 0(a1) -; RV32I-NEXT: addi a2, a2, 1 -; RV32I-NEXT: blt a2, a0, .LBB1_1 +; RV32I-NEXT: lw zero, 0(a2) +; RV32I-NEXT: addi a1, a1, 1 +; RV32I-NEXT: blt a1, a0, .LBB1_1 ; RV32I-NEXT: # %bb.2: # %ret ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_la: ; RV64I: # %bb.0: # %entry +; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: .Lpcrel_hi1: -; RV64I-NEXT: auipc a1, %got_pcrel_hi(g) -; RV64I-NEXT: ld a1, %pcrel_lo(.Lpcrel_hi1)(a1) -; RV64I-NEXT: li a2, 0 +; RV64I-NEXT: auipc a2, %got_pcrel_hi(g) +; RV64I-NEXT: ld a2, %pcrel_lo(.Lpcrel_hi1)(a2) ; RV64I-NEXT: .LBB1_1: # %loop ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: lw zero, 0(a1) -; RV64I-NEXT: addiw a2, a2, 1 -; RV64I-NEXT: blt a2, a0, .LBB1_1 +; RV64I-NEXT: lw zero, 0(a2) +; RV64I-NEXT: addiw a1, a1, 1 +; RV64I-NEXT: blt a1, a0, .LBB1_1 ; RV64I-NEXT: # %bb.2: # %ret ; RV64I-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll index d1b10af16063a..78b34452adef6 100644 --- a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll +++ b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll @@ -118,10 +118,9 @@ define void @test_regalloc_hint(i32 noundef signext %0, i32 noundef signext %1) ; ; FUSION-GENERIC-LABEL: test_regalloc_hint: ; FUSION-GENERIC: # %bb.0: -; FUSION-GENERIC-NEXT: lui a2, 3014 -; FUSION-GENERIC-NEXT: addiw a2, a2, 334 ; FUSION-GENERIC-NEXT: mv a0, a1 -; FUSION-GENERIC-NEXT: mv a1, a2 +; FUSION-GENERIC-NEXT: lui a1, 3014 +; FUSION-GENERIC-NEXT: addiw a1, a1, 334 ; FUSION-GENERIC-NEXT: tail bar tail call void @bar(i32 noundef signext %1, i32 noundef signext 12345678) ret void diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll index a9cb80cb66349..cbfb63785661a 100644 --- a/llvm/test/CodeGen/RISCV/mem.ll +++ b/llvm/test/CodeGen/RISCV/mem.ll @@ -123,8 +123,9 @@ define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: lbu a2, 2(a0) +; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: 
lbu zero, 0(a0) -; RV32I-NEXT: sub a0, a2, a1 +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 @@ -145,8 +146,9 @@ define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: lbu a2, 2(a0) +; RV32I-NEXT: sub a1, a2, a1 ; RV32I-NEXT: lbu zero, 0(a0) -; RV32I-NEXT: sub a0, a2, a1 +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll index 248964146325a..254a1f85faa00 100644 --- a/llvm/test/CodeGen/RISCV/mem64.ll +++ b/llvm/test/CodeGen/RISCV/mem64.ll @@ -168,8 +168,9 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 2(a0) +; RV64I-NEXT: sub a1, a2, a1 ; RV64I-NEXT: lbu zero, 0(a0) -; RV64I-NEXT: sub a0, a2, a1 +; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 @@ -190,8 +191,9 @@ define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 2(a0) +; RV64I-NEXT: sub a1, a2, a1 ; RV64I-NEXT: lbu zero, 0(a0) -; RV64I-NEXT: sub a0, a2, a1 +; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret ; sextload i1 %1 = getelementptr i1, ptr %a, i32 1 diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index f9086ba9d6354..6a63e80717623 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -2449,14 +2449,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a0, 0(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a1, 0(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a2, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a0, a0, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a1, a1, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 @@ -2466,14 +2466,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a0, a0, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 0(a1) +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a1, a1, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 @@ -2487,10 +2487,10 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a2, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 @@ -2500,14 +2500,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a2, 2(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a2, a2, 16 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a0, a0, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a1, a1, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 @@ -2835,14 +2835,14 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 @@ -2872,10 +2872,10 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) ; 
CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 @@ -3034,14 +3034,14 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a3, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 @@ -3077,10 +3077,10 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 @@ -4410,104 +4410,104 @@ entry: define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 3(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV32-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV32-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV32-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a5, 2(a0) +; 
CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-NEXT: ret ; ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV64-NEXT: lb a4, 3(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV64-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV64-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV64-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV64-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-NEXT: ret ; ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 3(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a3, a2 +; 
CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a4, 3(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBB-NEXT: ret @@ -4518,16 +4518,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a1, 3(a1) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a6, 2(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a7, 3(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 0(a0) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a1, a4, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a2, a2, a3 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a3, a6, a7 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a0, a5 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 2(a0) +; 
CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a5, a0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a3, a3, a4 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a1, a2, a1 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a0, a3 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a3, a0 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret @@ -4538,72 +4538,72 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize { ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a1, 3(a1) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 0(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a6, 1(a0) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a2, a2, a3 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 1(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a6, 2(a0) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a5, a5, a6 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a3, a3, a5 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a1, a1, 24 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a6, a6, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a6 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 3(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a3 ; 
CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-V-NEXT: ret ; ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lb a4, 3(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-V-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index f0290298e362a..ec83f16682296 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -3355,14 +3355,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a2, 2(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a0, 0(a0) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a1, 0(a1) -; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a2, a2, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 -; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a2, 0(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a0, a0, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV32-ZBB-NEXT: slli a1, a1, 16 +; CHECK-UNALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV32-ZBB-NEXT: sltu a2, a0, a1 @@ -3372,14 +3372,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 2(a0) -; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 16 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a0, a0, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a1, a1, 16 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: srli a0, a0, 32 @@ -3393,10 +3393,10 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV32-ZBKB: # %bb.0: # %entry ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a0) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a0, 2(a0) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a3, 0(a1) -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a0, a2, a0 -; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV32-ZBKB-NEXT: pack a1, a2, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT: sltu a2, a0, a1 @@ -3406,14 +3406,14 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_3: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a2, 2(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a2, a2, 16 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 2(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a0, a0, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 2(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: slli a1, a1, 16 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: srli a0, a0, 32 @@ -3741,14 +3741,14 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a2, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a3, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lbu a1, 4(a1) +; 
CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 @@ -3778,10 +3778,10 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 @@ -3940,14 +3940,14 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a2, 4(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a0, 0(a0) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a3, 4(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a1, 0(a1) -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a2, a2, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a3, a3, 32 -; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a1, a3 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a0, 4(a0) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a0, a0, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a0, a2, a0 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBB-NEXT: slli a1, a1, 32 +; CHECK-UNALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBB-NEXT: sltu a2, a0, a1 @@ -3983,10 +3983,10 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a2, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a0, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: rev8 a1, a1 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: sltu a2, a0, a1 @@ -5980,104 +5980,104 @@ entry: define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 3(a1) -; CHECK-ALIGNED-RV32-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV32-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV32-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV32-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-NEXT: or a2, a5, 
a2 +; CHECK-ALIGNED-RV32-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV32-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV32-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV32-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV32-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV32-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-NEXT: ret ; ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV64-NEXT: lb a4, 3(a1) -; CHECK-ALIGNED-RV64-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV64-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV64-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV64-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV64-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV64-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV64-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV64-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV64-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-NEXT: ret ; ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 3(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a0) -; 
CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV32-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBB-NEXT: ret ; ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a4, 3(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-ZBB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV64-ZBB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-ZBB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBB-NEXT: ret @@ -6088,16 +6088,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a1, 3(a1) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: 
lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a6, 2(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a7, 3(a0) -; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 0(a0) ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a1, a4, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a2, a2, a3 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a3, a6, a7 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a0, a5 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a0, a5, a0 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: packh a3, a3, a4 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a1, a2, a1 -; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a0, a3 +; CHECK-ALIGNED-RV32-ZBKB-NEXT: pack a0, a3, a0 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-ZBKB-NEXT: ret @@ -6108,72 +6108,72 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind { ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 1(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a4, 2(a1) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a1, 3(a1) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 0(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a6, 1(a0) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a2, a2, a3 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 2(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a5, 1(a0) +; CHECK-ALIGNED-RV64-ZBKB-NEXT: lbu a6, 2(a0) ; CHECK-ALIGNED-RV64-ZBKB-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a5, a5, a6 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: packh a3, a3, a5 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a4, a4, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a1, a1, 24 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a4 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a3, a3, 16 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a6, a6, 16 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a3 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a6 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a1, a1, a2 -; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-ZBKB-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-ZBKB-NEXT: ret ; ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry -; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 3(a1) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) -; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV32-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 3(a1) +; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV32-V-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 2(a0) +; 
CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 3(a0) +; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV32-V-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV32-V-NEXT: ret ; ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero: ; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry -; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lb a4, 3(a1) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 0(a1) -; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8 -; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16 -; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a2, a1 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 0(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a0) -; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 -; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0) -; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) -; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8 -; CHECK-ALIGNED-RV64-V-NEXT: or a2, a5, a2 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 0(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 1(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a1) +; CHECK-ALIGNED-RV64-V-NEXT: lb a1, 3(a1) +; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 8 ; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16 +; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24 +; CHECK-ALIGNED-RV64-V-NEXT: or a2, a3, a2 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4 +; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 1(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 2(a0) +; CHECK-ALIGNED-RV64-V-NEXT: lb a0, 3(a0) +; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 8 +; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3 +; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 16 ; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4 -; CHECK-ALIGNED-RV64-V-NEXT: or a1, a3, a1 -; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a2 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a5 +; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a2 +; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3 ; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1 ; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, a0 ; CHECK-ALIGNED-RV64-V-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/memmove.ll b/llvm/test/CodeGen/RISCV/memmove.ll index 62915bd4ad99d..4795d2c6a5209 100644 --- a/llvm/test/CodeGen/RISCV/memmove.ll +++ b/llvm/test/CodeGen/RISCV/memmove.ll @@ -159,38 +159,38 @@ entry: define void @unaligned_memmove7(ptr nocapture %dest, ptr %src) nounwind { ; RV32-LABEL: unaligned_memmove7: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 4(a1) -; RV32-NEXT: lbu a3, 5(a1) -; RV32-NEXT: lbu a4, 6(a1) -; RV32-NEXT: lbu a5, 0(a1) -; RV32-NEXT: lbu a6, 1(a1) -; RV32-NEXT: lbu a7, 2(a1) -; RV32-NEXT: lbu a1, 3(a1) -; RV32-NEXT: sb a2, 4(a0) -; RV32-NEXT: sb a3, 5(a0) -; RV32-NEXT: sb a4, 6(a0) -; RV32-NEXT: sb a5, 0(a0) -; RV32-NEXT: sb a6, 1(a0) -; RV32-NEXT: sb a7, 2(a0) -; RV32-NEXT: sb a1, 3(a0) +; RV32-NEXT: lbu a2, 0(a1) +; RV32-NEXT: lbu a3, 1(a1) +; RV32-NEXT: lbu a4, 2(a1) +; RV32-NEXT: lbu a5, 3(a1) +; RV32-NEXT: lbu a6, 4(a1) +; RV32-NEXT: lbu a7, 5(a1) +; RV32-NEXT: lbu a1, 6(a1) +; RV32-NEXT: sb a6, 4(a0) +; RV32-NEXT: sb a7, 5(a0) +; RV32-NEXT: sb a1, 6(a0) +; RV32-NEXT: sb a2, 0(a0) 
+; RV32-NEXT: sb a3, 1(a0) +; RV32-NEXT: sb a4, 2(a0) +; RV32-NEXT: sb a5, 3(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memmove7: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 4(a1) -; RV64-NEXT: lbu a3, 5(a1) -; RV64-NEXT: lbu a4, 6(a1) -; RV64-NEXT: lbu a5, 0(a1) -; RV64-NEXT: lbu a6, 1(a1) -; RV64-NEXT: lbu a7, 2(a1) -; RV64-NEXT: lbu a1, 3(a1) -; RV64-NEXT: sb a2, 4(a0) -; RV64-NEXT: sb a3, 5(a0) -; RV64-NEXT: sb a4, 6(a0) -; RV64-NEXT: sb a5, 0(a0) -; RV64-NEXT: sb a6, 1(a0) -; RV64-NEXT: sb a7, 2(a0) -; RV64-NEXT: sb a1, 3(a0) +; RV64-NEXT: lbu a2, 0(a1) +; RV64-NEXT: lbu a3, 1(a1) +; RV64-NEXT: lbu a4, 2(a1) +; RV64-NEXT: lbu a5, 3(a1) +; RV64-NEXT: lbu a6, 4(a1) +; RV64-NEXT: lbu a7, 5(a1) +; RV64-NEXT: lbu a1, 6(a1) +; RV64-NEXT: sb a6, 4(a0) +; RV64-NEXT: sb a7, 5(a0) +; RV64-NEXT: sb a1, 6(a0) +; RV64-NEXT: sb a2, 0(a0) +; RV64-NEXT: sb a3, 1(a0) +; RV64-NEXT: sb a4, 2(a0) +; RV64-NEXT: sb a5, 3(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memmove7: @@ -289,16 +289,16 @@ define void @unaligned_memmove15(ptr nocapture %dest, ptr %src) nounwind { ; ; RV32-FAST-LABEL: unaligned_memmove15: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lbu a2, 14(a1) -; RV32-FAST-NEXT: lw a3, 0(a1) -; RV32-FAST-NEXT: lw a4, 4(a1) -; RV32-FAST-NEXT: lw a5, 8(a1) -; RV32-FAST-NEXT: lh a1, 12(a1) -; RV32-FAST-NEXT: sb a2, 14(a0) -; RV32-FAST-NEXT: sw a3, 0(a0) -; RV32-FAST-NEXT: sw a4, 4(a0) -; RV32-FAST-NEXT: sw a5, 8(a0) -; RV32-FAST-NEXT: sh a1, 12(a0) +; RV32-FAST-NEXT: lw a2, 0(a1) +; RV32-FAST-NEXT: lw a3, 4(a1) +; RV32-FAST-NEXT: lw a4, 8(a1) +; RV32-FAST-NEXT: lh a5, 12(a1) +; RV32-FAST-NEXT: lbu a1, 14(a1) +; RV32-FAST-NEXT: sb a1, 14(a0) +; RV32-FAST-NEXT: sw a2, 0(a0) +; RV32-FAST-NEXT: sw a3, 4(a0) +; RV32-FAST-NEXT: sw a4, 8(a0) +; RV32-FAST-NEXT: sh a5, 12(a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memmove15: @@ -365,18 +365,18 @@ define void @unaligned_memmove31(ptr nocapture %dest, ptr %src) nounwind { ; ; RV64-FAST-LABEL: unaligned_memmove31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: lh a2, 28(a1) -; RV64-FAST-NEXT: lbu a3, 30(a1) -; RV64-FAST-NEXT: ld a4, 0(a1) -; RV64-FAST-NEXT: ld a5, 8(a1) -; RV64-FAST-NEXT: ld a6, 16(a1) -; RV64-FAST-NEXT: lw a1, 24(a1) -; RV64-FAST-NEXT: sh a2, 28(a0) -; RV64-FAST-NEXT: sb a3, 30(a0) -; RV64-FAST-NEXT: sd a4, 0(a0) -; RV64-FAST-NEXT: sd a5, 8(a0) -; RV64-FAST-NEXT: sd a6, 16(a0) -; RV64-FAST-NEXT: sw a1, 24(a0) +; RV64-FAST-NEXT: ld a2, 0(a1) +; RV64-FAST-NEXT: ld a3, 8(a1) +; RV64-FAST-NEXT: ld a4, 16(a1) +; RV64-FAST-NEXT: lw a5, 24(a1) +; RV64-FAST-NEXT: lh a6, 28(a1) +; RV64-FAST-NEXT: lbu a1, 30(a1) +; RV64-FAST-NEXT: sh a6, 28(a0) +; RV64-FAST-NEXT: sb a1, 30(a0) +; RV64-FAST-NEXT: sd a2, 0(a0) +; RV64-FAST-NEXT: sd a3, 8(a0) +; RV64-FAST-NEXT: sd a4, 16(a0) +; RV64-FAST-NEXT: sw a5, 24(a0) ; RV64-FAST-NEXT: ret entry: tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) @@ -579,18 +579,18 @@ define void @aligned_memmove31(ptr nocapture %dest, ptr %src) nounwind { ; ; RV64-BOTH-LABEL: aligned_memmove31: ; RV64-BOTH: # %bb.0: # %entry -; RV64-BOTH-NEXT: lh a2, 28(a1) -; RV64-BOTH-NEXT: lbu a3, 30(a1) -; RV64-BOTH-NEXT: ld a4, 0(a1) -; RV64-BOTH-NEXT: ld a5, 8(a1) -; RV64-BOTH-NEXT: ld a6, 16(a1) -; RV64-BOTH-NEXT: lw a1, 24(a1) -; RV64-BOTH-NEXT: sh a2, 28(a0) -; RV64-BOTH-NEXT: sb a3, 30(a0) -; RV64-BOTH-NEXT: sd a4, 0(a0) -; RV64-BOTH-NEXT: sd a5, 8(a0) -; RV64-BOTH-NEXT: sd a6, 16(a0) -; RV64-BOTH-NEXT: sw a1, 24(a0) +; RV64-BOTH-NEXT: ld a2, 0(a1) +; RV64-BOTH-NEXT: ld a3, 
8(a1) +; RV64-BOTH-NEXT: ld a4, 16(a1) +; RV64-BOTH-NEXT: lw a5, 24(a1) +; RV64-BOTH-NEXT: lh a6, 28(a1) +; RV64-BOTH-NEXT: lbu a1, 30(a1) +; RV64-BOTH-NEXT: sh a6, 28(a0) +; RV64-BOTH-NEXT: sb a1, 30(a0) +; RV64-BOTH-NEXT: sd a2, 0(a0) +; RV64-BOTH-NEXT: sd a3, 8(a0) +; RV64-BOTH-NEXT: sd a4, 16(a0) +; RV64-BOTH-NEXT: sw a5, 24(a0) ; RV64-BOTH-NEXT: ret entry: tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll index 35ce7fad0ea67..3b80c5684bfd0 100644 --- a/llvm/test/CodeGen/RISCV/memset-pattern.ll +++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll @@ -15,24 +15,24 @@ define void @memset_1(ptr %a, i128 %value) nounwind { ; RV32-BOTH-LABEL: memset_1: ; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader ; RV32-BOTH-NEXT: li a2, 0 -; RV32-BOTH-NEXT: lw a3, 0(a1) -; RV32-BOTH-NEXT: lw a4, 4(a1) -; RV32-BOTH-NEXT: lw a5, 8(a1) +; RV32-BOTH-NEXT: li a3, 0 +; RV32-BOTH-NEXT: lw a4, 0(a1) +; RV32-BOTH-NEXT: lw a5, 4(a1) +; RV32-BOTH-NEXT: lw a6, 8(a1) ; RV32-BOTH-NEXT: lw a1, 12(a1) -; RV32-BOTH-NEXT: li a6, 0 ; RV32-BOTH-NEXT: .LBB0_1: # %loadstoreloop ; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-BOTH-NEXT: slli a7, a2, 4 ; RV32-BOTH-NEXT: addi a2, a2, 1 ; RV32-BOTH-NEXT: add a7, a0, a7 -; RV32-BOTH-NEXT: seqz t0, a2 -; RV32-BOTH-NEXT: add a6, a6, t0 -; RV32-BOTH-NEXT: or t0, a2, a6 -; RV32-BOTH-NEXT: sw a3, 0(a7) -; RV32-BOTH-NEXT: sw a4, 4(a7) -; RV32-BOTH-NEXT: sw a5, 8(a7) +; RV32-BOTH-NEXT: sw a4, 0(a7) +; RV32-BOTH-NEXT: sw a5, 4(a7) +; RV32-BOTH-NEXT: sw a6, 8(a7) ; RV32-BOTH-NEXT: sw a1, 12(a7) -; RV32-BOTH-NEXT: beqz t0, .LBB0_1 +; RV32-BOTH-NEXT: seqz a7, a2 +; RV32-BOTH-NEXT: add a3, a3, a7 +; RV32-BOTH-NEXT: or a7, a2, a3 +; RV32-BOTH-NEXT: beqz a7, .LBB0_1 ; RV32-BOTH-NEXT: # %bb.2: # %split ; RV32-BOTH-NEXT: ret ; @@ -60,19 +60,18 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind { ; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: li a2, 0 ; RV32-NEXT: li a3, 0 -; RV32-NEXT: lw a4, 4(a1) -; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a4, 0(a1) +; RV32-NEXT: lw a5, 4(a1) ; RV32-NEXT: lw a6, 8(a1) ; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: srli a7, a4, 24 -; RV32-NEXT: srli t0, a4, 16 -; RV32-NEXT: srli t1, a4, 8 -; RV32-NEXT: srli t2, a5, 24 -; RV32-NEXT: srli t3, a5, 16 -; RV32-NEXT: srli t4, a5, 8 +; RV32-NEXT: srli a7, a5, 24 +; RV32-NEXT: srli t0, a5, 16 +; RV32-NEXT: srli t1, a5, 8 +; RV32-NEXT: srli t2, a4, 24 +; RV32-NEXT: srli t3, a4, 16 +; RV32-NEXT: srli t4, a4, 8 ; RV32-NEXT: srli t5, a6, 24 ; RV32-NEXT: srli t6, a6, 16 ; RV32-NEXT: srli s0, a6, 8 @@ -84,12 +83,11 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind { ; RV32-NEXT: slli s4, a2, 4 ; RV32-NEXT: addi a2, a2, 1 ; RV32-NEXT: add s4, a0, s4 -; RV32-NEXT: seqz s5, a2 -; RV32-NEXT: sb a4, 4(s4) +; RV32-NEXT: sb a5, 4(s4) ; RV32-NEXT: sb t1, 5(s4) ; RV32-NEXT: sb t0, 6(s4) ; RV32-NEXT: sb a7, 7(s4) -; RV32-NEXT: sb a5, 0(s4) +; RV32-NEXT: sb a4, 0(s4) ; RV32-NEXT: sb t4, 1(s4) ; RV32-NEXT: sb t3, 2(s4) ; RV32-NEXT: sb t2, 3(s4) @@ -97,20 +95,20 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind { ; RV32-NEXT: sb s0, 9(s4) ; RV32-NEXT: sb t6, 10(s4) ; RV32-NEXT: sb t5, 11(s4) -; RV32-NEXT: add a3, a3, s5 -; RV32-NEXT: or s5, a2, a3 ; RV32-NEXT: sb a1, 12(s4) ; RV32-NEXT: sb s3, 
13(s4) ; RV32-NEXT: sb s2, 14(s4) ; RV32-NEXT: sb s1, 15(s4) -; RV32-NEXT: beqz s5, .LBB1_1 +; RV32-NEXT: seqz s4, a2 +; RV32-NEXT: add a3, a3, s4 +; RV32-NEXT: or s4, a2, a3 +; RV32-NEXT: beqz s4, .LBB1_1 ; RV32-NEXT: # %bb.2: # %split ; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; @@ -165,24 +163,24 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind { ; RV32-FAST-LABEL: memset_1_noalign: ; RV32-FAST: # %bb.0: # %loadstoreloop.preheader ; RV32-FAST-NEXT: li a2, 0 -; RV32-FAST-NEXT: lw a3, 0(a1) -; RV32-FAST-NEXT: lw a4, 4(a1) -; RV32-FAST-NEXT: lw a5, 8(a1) +; RV32-FAST-NEXT: li a3, 0 +; RV32-FAST-NEXT: lw a4, 0(a1) +; RV32-FAST-NEXT: lw a5, 4(a1) +; RV32-FAST-NEXT: lw a6, 8(a1) ; RV32-FAST-NEXT: lw a1, 12(a1) -; RV32-FAST-NEXT: li a6, 0 ; RV32-FAST-NEXT: .LBB1_1: # %loadstoreloop ; RV32-FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-FAST-NEXT: slli a7, a2, 4 ; RV32-FAST-NEXT: addi a2, a2, 1 ; RV32-FAST-NEXT: add a7, a0, a7 -; RV32-FAST-NEXT: seqz t0, a2 -; RV32-FAST-NEXT: add a6, a6, t0 -; RV32-FAST-NEXT: or t0, a2, a6 -; RV32-FAST-NEXT: sw a3, 0(a7) -; RV32-FAST-NEXT: sw a4, 4(a7) -; RV32-FAST-NEXT: sw a5, 8(a7) +; RV32-FAST-NEXT: sw a4, 0(a7) +; RV32-FAST-NEXT: sw a5, 4(a7) +; RV32-FAST-NEXT: sw a6, 8(a7) ; RV32-FAST-NEXT: sw a1, 12(a7) -; RV32-FAST-NEXT: beqz t0, .LBB1_1 +; RV32-FAST-NEXT: seqz a7, a2 +; RV32-FAST-NEXT: add a3, a3, a7 +; RV32-FAST-NEXT: or a7, a2, a3 +; RV32-FAST-NEXT: beqz a7, .LBB1_1 ; RV32-FAST-NEXT: # %bb.2: # %split ; RV32-FAST-NEXT: ret ; @@ -205,26 +203,26 @@ define void @memset_4(ptr %a, i128 %value) nounwind { ; RV32-BOTH-LABEL: memset_4: ; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader ; RV32-BOTH-NEXT: li a2, 0 -; RV32-BOTH-NEXT: lw a3, 0(a1) -; RV32-BOTH-NEXT: lw a4, 4(a1) -; RV32-BOTH-NEXT: lw a5, 8(a1) +; RV32-BOTH-NEXT: li a3, 0 +; RV32-BOTH-NEXT: lw a4, 0(a1) +; RV32-BOTH-NEXT: lw a5, 4(a1) +; RV32-BOTH-NEXT: lw a6, 8(a1) ; RV32-BOTH-NEXT: lw a1, 12(a1) -; RV32-BOTH-NEXT: li a6, 0 ; RV32-BOTH-NEXT: .LBB2_1: # %loadstoreloop ; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-BOTH-NEXT: slli a7, a2, 4 ; RV32-BOTH-NEXT: addi a2, a2, 1 -; RV32-BOTH-NEXT: seqz t0, a2 -; RV32-BOTH-NEXT: sltiu t1, a2, 4 -; RV32-BOTH-NEXT: add a6, a6, t0 -; RV32-BOTH-NEXT: seqz t0, a6 -; RV32-BOTH-NEXT: and t0, t0, t1 ; RV32-BOTH-NEXT: add a7, a0, a7 -; RV32-BOTH-NEXT: sw a3, 0(a7) -; RV32-BOTH-NEXT: sw a4, 4(a7) -; RV32-BOTH-NEXT: sw a5, 8(a7) +; RV32-BOTH-NEXT: seqz t0, a2 +; RV32-BOTH-NEXT: sw a4, 0(a7) +; RV32-BOTH-NEXT: sw a5, 4(a7) +; RV32-BOTH-NEXT: sw a6, 8(a7) ; RV32-BOTH-NEXT: sw a1, 12(a7) -; RV32-BOTH-NEXT: bnez t0, .LBB2_1 +; RV32-BOTH-NEXT: add a3, a3, t0 +; RV32-BOTH-NEXT: seqz a7, a3 +; RV32-BOTH-NEXT: sltiu t0, a2, 4 +; RV32-BOTH-NEXT: and a7, a7, t0 +; RV32-BOTH-NEXT: bnez a7, .LBB2_1 ; RV32-BOTH-NEXT: # %bb.2: # %split ; RV32-BOTH-NEXT: ret ; @@ -250,28 +248,28 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind { ; RV32-BOTH-NEXT: beqz a4, .LBB3_5 ; RV32-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader ; RV32-BOTH-NEXT: li a4, 0 -; RV32-BOTH-NEXT: lw a5, 0(a1) -; RV32-BOTH-NEXT: lw a6, 4(a1) -; RV32-BOTH-NEXT: lw a7, 8(a1) +; RV32-BOTH-NEXT: li a5, 0 +; RV32-BOTH-NEXT: lw a6, 0(a1) +; RV32-BOTH-NEXT: lw a7, 4(a1) +; 
RV32-BOTH-NEXT: lw t0, 8(a1) ; RV32-BOTH-NEXT: lw a1, 12(a1) -; RV32-BOTH-NEXT: li t0, 0 ; RV32-BOTH-NEXT: j .LBB3_3 ; RV32-BOTH-NEXT: .LBB3_2: # %loadstoreloop ; RV32-BOTH-NEXT: # in Loop: Header=BB3_3 Depth=1 -; RV32-BOTH-NEXT: sltu t1, t0, a3 +; RV32-BOTH-NEXT: sltu t1, a5, a3 ; RV32-BOTH-NEXT: beqz t1, .LBB3_5 ; RV32-BOTH-NEXT: .LBB3_3: # %loadstoreloop ; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-BOTH-NEXT: slli t1, a4, 4 ; RV32-BOTH-NEXT: addi a4, a4, 1 -; RV32-BOTH-NEXT: seqz t2, a4 -; RV32-BOTH-NEXT: add t0, t0, t2 ; RV32-BOTH-NEXT: add t1, a0, t1 -; RV32-BOTH-NEXT: sw a5, 0(t1) -; RV32-BOTH-NEXT: sw a6, 4(t1) -; RV32-BOTH-NEXT: sw a7, 8(t1) +; RV32-BOTH-NEXT: sw a6, 0(t1) +; RV32-BOTH-NEXT: sw a7, 4(t1) +; RV32-BOTH-NEXT: sw t0, 8(t1) ; RV32-BOTH-NEXT: sw a1, 12(t1) -; RV32-BOTH-NEXT: bne t0, a3, .LBB3_2 +; RV32-BOTH-NEXT: seqz t1, a4 +; RV32-BOTH-NEXT: add a5, a5, t1 +; RV32-BOTH-NEXT: bne a5, a3, .LBB3_2 ; RV32-BOTH-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 ; RV32-BOTH-NEXT: sltu t1, a4, a2 ; RV32-BOTH-NEXT: bnez t1, .LBB3_3 diff --git a/llvm/test/CodeGen/RISCV/misched-postra-direction.mir b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir index 2cca042bebee6..e4b934c3036ae 100644 --- a/llvm/test/CodeGen/RISCV/misched-postra-direction.mir +++ b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir @@ -11,6 +11,19 @@ # RUN: -misched-dump-schedule-trace -misched-postra-direction=bidirectional \ # RUN: -o - %s 2>&1 | FileCheck --check-prefix=BIDIRECTIONAL %s +# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -passes=postmisched \ +# RUN: -enable-post-misched -debug-only=machine-scheduler \ +# RUN: -misched-dump-schedule-trace -misched-postra-direction=topdown \ +# RUN: -o - %s 2>&1 | FileCheck --check-prefix=TOPDOWN %s +# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -passes=postmisched \ +# RUN: -enable-post-misched -debug-only=machine-scheduler \ +# RUN: -misched-dump-schedule-trace -misched-postra-direction=bottomup \ +# RUN: -o - %s 2>&1 | FileCheck --check-prefix=BOTTOMUP %s +# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -passes=postmisched \ +# RUN: -enable-post-misched -debug-only=machine-scheduler \ +# RUN: -misched-dump-schedule-trace -misched-postra-direction=bidirectional \ +# RUN: -o - %s 2>&1 | FileCheck --check-prefix=BIDIRECTIONAL %s + # REQUIRES: asserts --- diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 548c7e1c6ea8c..39dca893bd428 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1298,34 +1298,34 @@ define i64 @muli64_m3840(i64 %a) nounwind { define i128 @muli128_m3840(i128 %a) nounwind { ; RV32I-LABEL: muli128_m3840: ; RV32I: # %bb.0: +; RV32I-NEXT: lw a6, 0(a1) ; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: lw a2, 8(a1) -; RV32I-NEXT: lw a5, 0(a1) ; RV32I-NEXT: lw a4, 12(a1) ; RV32I-NEXT: srli a1, a3, 20 -; RV32I-NEXT: slli a6, a2, 12 +; RV32I-NEXT: slli a5, a2, 12 ; RV32I-NEXT: srli a7, a3, 24 ; RV32I-NEXT: slli t0, a2, 8 ; RV32I-NEXT: srli t1, a2, 20 -; RV32I-NEXT: or a1, a6, a1 -; RV32I-NEXT: slli a6, a4, 12 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: slli a5, a4, 12 ; RV32I-NEXT: srli t2, a2, 24 ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: or a2, t0, a7 -; RV32I-NEXT: srli a7, a5, 20 -; RV32I-NEXT: or a6, a6, t1 +; RV32I-NEXT: srli a7, a6, 20 +; RV32I-NEXT: or a5, a5, t1 ; RV32I-NEXT: slli t0, a3, 12 ; RV32I-NEXT: or t1, a4, t2 -; RV32I-NEXT: srli t2, a5, 24 +; RV32I-NEXT: srli t2, a6, 24 ; RV32I-NEXT: slli t3, a3, 8 ; RV32I-NEXT: or a3, t0, a7 -; RV32I-NEXT: slli a4, a5, 12 -; 
RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: slli a4, a6, 12 +; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: or t0, t3, t2 -; RV32I-NEXT: sltu t2, a2, a1 -; RV32I-NEXT: sub a6, t1, a6 -; RV32I-NEXT: sltu a7, a5, a4 -; RV32I-NEXT: sub a6, a6, t2 +; RV32I-NEXT: sltu a7, a2, a1 +; RV32I-NEXT: sub a5, t1, a5 +; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sltu a7, a6, a4 ; RV32I-NEXT: mv t1, a7 ; RV32I-NEXT: beq t0, a3, .LBB36_2 ; RV32I-NEXT: # %bb.1: @@ -1333,15 +1333,15 @@ define i128 @muli128_m3840(i128 %a) nounwind { ; RV32I-NEXT: .LBB36_2: ; RV32I-NEXT: sub a2, a2, a1 ; RV32I-NEXT: sub a1, t0, a3 -; RV32I-NEXT: sub a5, a5, a4 -; RV32I-NEXT: sltu a3, a2, t1 +; RV32I-NEXT: sub a3, a6, a4 +; RV32I-NEXT: sltu a4, a2, t1 ; RV32I-NEXT: sub a2, a2, t1 ; RV32I-NEXT: sub a1, a1, a7 -; RV32I-NEXT: sub a3, a6, a3 -; RV32I-NEXT: sw a5, 0(a0) +; RV32I-NEXT: sub a5, a5, a4 +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a1, 4(a0) ; RV32I-NEXT: sw a2, 8(a0) -; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a5, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m3840: diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index fe19a4fa8bbd8..a57acf5576cb7 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -162,35 +162,35 @@ define i32 @neg_abs32_multiuse(i32 %x, ptr %y) { ; RV32I: # %bb.0: ; RV32I-NEXT: srai a2, a0, 31 ; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: sub a2, a0, a2 -; RV32I-NEXT: neg a0, a2 -; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: sw a0, 0(a1) +; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: neg_abs32_multiuse: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: neg a2, a0 -; RV32ZBB-NEXT: max a2, a0, a2 -; RV32ZBB-NEXT: neg a0, a2 -; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: max a0, a0, a2 +; RV32ZBB-NEXT: sw a0, 0(a1) +; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: neg_abs32_multiuse: ; RV64I: # %bb.0: ; RV64I-NEXT: sraiw a2, a0, 31 ; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: subw a2, a0, a2 -; RV64I-NEXT: negw a0, a2 -; RV64I-NEXT: sw a2, 0(a1) +; RV64I-NEXT: subw a0, a0, a2 +; RV64I-NEXT: sw a0, 0(a1) +; RV64I-NEXT: negw a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: neg_abs32_multiuse: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sext.w a0, a0 ; RV64ZBB-NEXT: negw a2, a0 -; RV64ZBB-NEXT: max a2, a0, a2 -; RV64ZBB-NEXT: negw a0, a2 -; RV64ZBB-NEXT: sw a2, 0(a1) +; RV64ZBB-NEXT: max a0, a0, a2 +; RV64ZBB-NEXT: sw a0, 0(a1) +; RV64ZBB-NEXT: negw a0, a0 ; RV64ZBB-NEXT: ret %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true) store i32 %abs, ptr %y @@ -208,14 +208,12 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: snez a3, a0 -; RV32I-NEXT: neg a4, a1 -; RV32I-NEXT: sub a3, a4, a3 -; RV32I-NEXT: neg a4, a0 ; RV32I-NEXT: sw a0, 0(a2) ; RV32I-NEXT: sw a1, 4(a2) -; RV32I-NEXT: mv a0, a4 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: snez a2, a0 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: neg_abs64_multiuse: @@ -227,31 +225,29 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) { ; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: .LBB5_2: -; RV32ZBB-NEXT: snez a3, a0 -; RV32ZBB-NEXT: neg a4, a1 -; RV32ZBB-NEXT: sub a3, a4, a3 -; RV32ZBB-NEXT: neg a4, a0 ; RV32ZBB-NEXT: sw a0, 0(a2) ; RV32ZBB-NEXT: sw a1, 4(a2) -; RV32ZBB-NEXT: mv a0, a4 -; RV32ZBB-NEXT: mv a1, a3 +; RV32ZBB-NEXT: snez a2, a0 +; RV32ZBB-NEXT: neg a1, a1 +; 
RV32ZBB-NEXT: sub a1, a1, a2 +; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: neg_abs64_multiuse: ; RV64I: # %bb.0: ; RV64I-NEXT: srai a2, a0, 63 ; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: sub a2, a0, a2 -; RV64I-NEXT: neg a0, a2 -; RV64I-NEXT: sd a2, 0(a1) +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: sd a0, 0(a1) +; RV64I-NEXT: neg a0, a0 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: neg_abs64_multiuse: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: neg a2, a0 -; RV64ZBB-NEXT: max a2, a0, a2 -; RV64ZBB-NEXT: neg a0, a2 -; RV64ZBB-NEXT: sd a2, 0(a1) +; RV64ZBB-NEXT: max a0, a0, a2 +; RV64ZBB-NEXT: sd a0, 0(a1) +; RV64ZBB-NEXT: neg a0, a0 ; RV64ZBB-NEXT: ret %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true) store i64 %abs, ptr %y diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll index 5ede992e844f1..ff9d7a009fc29 100644 --- a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll +++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll @@ -233,9 +233,9 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) { ; RV32I-NEXT: addi a2, a2, 514 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: slli a2, a0, 7 -; RV32I-NEXT: srli a3, a0, 1 -; RV32I-NEXT: sub a0, a2, a3 -; RV32I-NEXT: sw a3, 0(a1) +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sw a0, 0(a1) +; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: @@ -244,8 +244,8 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) { ; RV32ZBB-NEXT: addi a2, a2, 514 ; RV32ZBB-NEXT: and a0, a0, a2 ; RV32ZBB-NEXT: srli a2, a0, 1 -; RV32ZBB-NEXT: orc.b a0, a0 ; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: orc.b a0, a0 ; RV32ZBB-NEXT: ret entry: %and = and i32 %x, 33686018 @@ -264,8 +264,8 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32 %x, ptr %arr) { ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: slli a2, a0, 7 ; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: @@ -274,8 +274,8 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32 %x, ptr %arr) { ; RV32ZBB-NEXT: addi a2, a2, 514 ; RV32ZBB-NEXT: and a0, a0, a2 ; RV32ZBB-NEXT: slli a2, a0, 7 -; RV32ZBB-NEXT: orc.b a0, a0 ; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: orc.b a0, a0 ; RV32ZBB-NEXT: ret entry: %and = and i32 %x, 33686018 @@ -320,8 +320,8 @@ define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32 %x, ptr %arr){ ; CHECK-NEXT: addi a2, a2, 257 ; CHECK-NEXT: and a0, a0, a2 ; CHECK-NEXT: slli a2, a0, 8 -; CHECK-NEXT: sub a0, a2, a0 ; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: sub a0, a2, a0 ; CHECK-NEXT: ret entry: %and = and i32 %x, 16843009 @@ -338,10 +338,10 @@ define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32 %x, ptr %arr) { ; CHECK-NEXT: addi a2, a2, 514 ; CHECK-NEXT: and a0, a0, a2 ; CHECK-NEXT: slli a2, a0, 7 -; CHECK-NEXT: srli a3, a0, 1 -; CHECK-NEXT: sub a0, a2, a3 +; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: sw a2, 0(a1) -; CHECK-NEXT: sw a3, 4(a1) +; CHECK-NEXT: sw a0, 4(a1) +; CHECK-NEXT: sub a0, a2, a0 ; CHECK-NEXT: ret entry: %and = and i32 %x, 33686018 diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 5a01d43fea56b..48ba11b260bda 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -373,12 +373,12 @@ define i64 @uaddo6_xor(i64 %a, i64 %b) { ; ; RV64-LABEL: uaddo6_xor: ; RV64: # %bb.0: -; RV64-NEXT: not a2, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: 
bltu a2, a1, .LBB8_2 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: bltu a0, a1, .LBB8_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a0, 42 +; RV64-NEXT: li a1, 42 ; RV64-NEXT: .LBB8_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %x = xor i64 %a, -1 %cmp = icmp ult i64 %x, %b @@ -409,12 +409,12 @@ define i64 @uaddo6_xor_commuted(i64 %a, i64 %b) { ; ; RV64-LABEL: uaddo6_xor_commuted: ; RV64: # %bb.0: -; RV64-NEXT: not a2, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltu a2, a1, .LBB9_2 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: bltu a0, a1, .LBB9_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a0, 42 +; RV64-NEXT: li a1, 42 ; RV64-NEXT: .LBB9_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %x = xor i64 %a, -1 %cmp = icmp ult i64 %x, %b @@ -436,8 +436,8 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) { ; RV32-NEXT: .cfi_offset s0, -8 ; RV32-NEXT: .cfi_offset s1, -12 ; RV32-NEXT: mv s0, a2 -; RV32-NEXT: not a1, a1 ; RV32-NEXT: not a0, a0 +; RV32-NEXT: not a1, a1 ; RV32-NEXT: beq a1, a3, .LBB10_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: sltu a2, a1, a3 @@ -472,8 +472,8 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) { ; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: not a0, a0 ; RV64-NEXT: mv s0, a1 +; RV64-NEXT: not a0, a0 ; RV64-NEXT: bltu a0, a1, .LBB10_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li s0, 42 @@ -499,17 +499,17 @@ define i64 @uaddo6_xor_multi_use(i64 %a, i64 %b) { define i1 @uaddo6_xor_op_after_XOR(i32 %a, ptr %b.ptr) { ; RV32-LABEL: uaddo6_xor_op_after_XOR: ; RV32: # %bb.0: -; RV32-NEXT: lw a1, 0(a1) ; RV32-NEXT: not a0, a0 +; RV32-NEXT: lw a1, 0(a1) ; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo6_xor_op_after_XOR: ; RV64: # %bb.0: -; RV64-NEXT: lw a1, 0(a1) ; RV64-NEXT: not a0, a0 ; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: lw a1, 0(a1) ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret @@ -811,8 +811,8 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) { ; RV64-LABEL: usubo_ult_i64_math_overflow_used: ; RV64: # %bb.0: ; RV64-NEXT: sub a3, a0, a1 -; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: sd a3, 0(a2) +; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: ret %s = sub i64 %x, %y store i64 %s, ptr %p @@ -1080,33 +1080,33 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 ; RV32-NEXT: mv s5, a5 -; RV32-NEXT: mv s3, a1 -; RV32-NEXT: andi a1, a5, 1 -; RV32-NEXT: beqz a1, .LBB32_8 +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: andi a0, a5, 1 +; RV32-NEXT: beqz a0, .LBB32_8 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 ; RV32-NEXT: mv s2, a3 ; RV32-NEXT: mv s1, a2 -; RV32-NEXT: mv s4, a0 -; RV32-NEXT: beq s3, a3, .LBB32_3 +; RV32-NEXT: mv s4, a1 +; RV32-NEXT: beq a1, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, s3, s2 +; RV32-NEXT: sltu s6, s4, s2 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s4, s1 +; RV32-NEXT: sltu s6, s3, s1 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call ; RV32-NEXT: beqz s6, .LBB32_8 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s4, s1 +; RV32-NEXT: sltu a1, s3, s1 ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s3, s2, .LBB32_7 +; RV32-NEXT: beq s4, s2, .LBB32_7 ; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s3, s2 +; RV32-NEXT: sltu a0, s4, s2 ; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s3, s2 -; RV32-NEXT: sub a3, s4, s1 +; RV32-NEXT: sub a2, s4, s2 +; RV32-NEXT: sub a3, s3, 
s1 ; RV32-NEXT: sub a2, a2, a1 ; RV32-NEXT: sw a3, 0(s0) ; RV32-NEXT: sw a2, 4(s0) @@ -1151,13 +1151,13 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV64-NEXT: .cfi_offset s3, -40 ; RV64-NEXT: .cfi_offset s4, -48 ; RV64-NEXT: mv s0, a3 -; RV64-NEXT: mv s2, a1 -; RV64-NEXT: andi a1, a3, 1 -; RV64-NEXT: beqz a1, .LBB32_3 +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: andi a0, a3, 1 +; RV64-NEXT: beqz a0, .LBB32_3 ; RV64-NEXT: # %bb.1: # %t ; RV64-NEXT: mv s1, a2 -; RV64-NEXT: mv s3, a0 -; RV64-NEXT: sltu s4, a0, s2 +; RV64-NEXT: mv s2, a1 +; RV64-NEXT: sltu s4, s3, a1 ; RV64-NEXT: mv a0, s4 ; RV64-NEXT: call call ; RV64-NEXT: bgeu s3, s2, .LBB32_3 diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll index 8e858bdd29762..ccb57c442fbfa 100644 --- a/llvm/test/CodeGen/RISCV/pr51206.ll +++ b/llvm/test/CodeGen/RISCV/pr51206.ll @@ -13,21 +13,21 @@ define signext i32 @wobble() nounwind { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: lui a0, %hi(global) ; CHECK-NEXT: lui a1, %hi(global.1) -; CHECK-NEXT: lbu a0, %lo(global)(a0) ; CHECK-NEXT: lui a2, %hi(global.2) -; CHECK-NEXT: lui a3, 52429 -; CHECK-NEXT: lbu a2, %lo(global.2)(a2) +; CHECK-NEXT: lbu a0, %lo(global)(a0) ; CHECK-NEXT: addi a0, a0, 1 ; CHECK-NEXT: sw a0, %lo(global.1)(a1) -; CHECK-NEXT: lui a1, %hi(global.3) -; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: lui a1, 52429 +; CHECK-NEXT: lbu a2, %lo(global.2)(a2) +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: mul a0, a0, a2 ; CHECK-NEXT: slli a2, a0, 48 -; CHECK-NEXT: mulhu a2, a2, a3 -; CHECK-NEXT: srli a2, a2, 18 -; CHECK-NEXT: li a3, 5 -; CHECK-NEXT: sw a2, %lo(global.3)(a1) -; CHECK-NEXT: bgeu a0, a3, .LBB0_2 +; CHECK-NEXT: mulhu a1, a2, a1 +; CHECK-NEXT: lui a2, %hi(global.3) +; CHECK-NEXT: srli a1, a1, 18 +; CHECK-NEXT: sw a1, %lo(global.3)(a2) +; CHECK-NEXT: li a1, 5 +; CHECK-NEXT: bgeu a0, a1, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb12 ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll index e5cba679729fa..c06a5b1cf11fa 100644 --- a/llvm/test/CodeGen/RISCV/pr58511.ll +++ b/llvm/test/CodeGen/RISCV/pr58511.ll @@ -47,8 +47,8 @@ define i32 @h(i1 %0, i32 %1, ptr %2) { ; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) +; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: ret BB: %I = select i1 %0, i32 -1, i32 0 @@ -66,8 +66,8 @@ define i32 @i(i1 %0, i32 %1, ptr %2) { ; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) +; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: ret BB: %I = select i1 %0, i32 0, i32 -1 diff --git a/llvm/test/CodeGen/RISCV/pr63816.ll b/llvm/test/CodeGen/RISCV/pr63816.ll index 75ddeda3de507..5632e8ec16224 100644 --- a/llvm/test/CodeGen/RISCV/pr63816.ll +++ b/llvm/test/CodeGen/RISCV/pr63816.ll @@ -47,12 +47,12 @@ define void @test(ptr %0, ptr %1) nounwind { ; CHECK-NEXT: fcvt.d.s fs6, fa0 ; CHECK-NEXT: fcvt.d.s fs5, fs5 ; CHECK-NEXT: fcvt.d.s fs4, fs4 -; CHECK-NEXT: lhu a0, 14(s1) ; CHECK-NEXT: fcvt.d.s fs3, fs3 ; CHECK-NEXT: fcvt.d.s fs2, fs2 ; CHECK-NEXT: fcvt.d.s fs1, fs1 -; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: fcvt.d.s fs0, fs0 +; CHECK-NEXT: lhu a0, 14(s1) +; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: fcvt.d.s fa5, fa0 ; CHECK-NEXT: fsd fs2, 32(s0) diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index 
9fc9a3c42867e..55c198aeb98b0 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -7,21 +7,21 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-LABEL: test: ; NOREMAT: # %bb.0: -; NOREMAT-NEXT: addi sp, sp, -752 -; NOREMAT-NEXT: .cfi_def_cfa_offset 752 -; NOREMAT-NEXT: sd ra, 744(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s0, 736(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s1, 728(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s2, 720(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s3, 712(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s4, 704(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s5, 696(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s6, 688(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s7, 680(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s8, 672(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s9, 664(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s10, 656(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s11, 648(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addi sp, sp, -720 +; NOREMAT-NEXT: .cfi_def_cfa_offset 720 +; NOREMAT-NEXT: sd ra, 712(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s0, 704(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s1, 696(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s2, 688(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s3, 680(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s4, 672(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s5, 664(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s6, 656(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s7, 648(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s8, 640(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s9, 632(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s10, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s11, 616(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: .cfi_offset ra, -8 ; NOREMAT-NEXT: .cfi_offset s0, -16 ; NOREMAT-NEXT: .cfi_offset s1, -24 @@ -35,608 +35,597 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: .cfi_offset s9, -88 ; NOREMAT-NEXT: .cfi_offset s10, -96 ; NOREMAT-NEXT: .cfi_offset s11, -104 -; NOREMAT-NEXT: csrr a2, vlenb -; NOREMAT-NEXT: slli a2, a2, 1 -; NOREMAT-NEXT: sub sp, sp, a2 -; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb ; NOREMAT-NEXT: mv a7, a0 ; NOREMAT-NEXT: li a0, 32 -; NOREMAT-NEXT: addi a5, a7, 512 +; NOREMAT-NEXT: addi a6, a7, 512 ; NOREMAT-NEXT: addi a4, a7, 1024 -; NOREMAT-NEXT: addi a6, a7, 1536 -; NOREMAT-NEXT: li t4, 1 -; NOREMAT-NEXT: li a2, 5 +; NOREMAT-NEXT: addi a5, a7, 1536 +; NOREMAT-NEXT: li t0, 1 +; NOREMAT-NEXT: li a3, 5 ; NOREMAT-NEXT: li t1, 3 -; NOREMAT-NEXT: li t0, 7 -; NOREMAT-NEXT: lui t5, 1 -; NOREMAT-NEXT: li s4, 9 -; NOREMAT-NEXT: li s6, 11 -; NOREMAT-NEXT: li s9, 13 -; NOREMAT-NEXT: li ra, 15 -; NOREMAT-NEXT: lui t2, 2 -; NOREMAT-NEXT: lui s1, 3 -; NOREMAT-NEXT: lui t3, 4 -; NOREMAT-NEXT: lui s0, 5 -; NOREMAT-NEXT: lui s3, 6 -; NOREMAT-NEXT: lui s7, 7 -; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; NOREMAT-NEXT: slli t4, t4, 11 -; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli a3, a2, 9 -; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli t6, t1, 10 -; NOREMAT-NEXT: slli s2, t0, 9 -; NOREMAT-NEXT: add a0, a7, t5 -; NOREMAT-NEXT: lui s11, 1 -; NOREMAT-NEXT: slli s4, s4, 9 -; NOREMAT-NEXT: slli s5, a2, 10 -; NOREMAT-NEXT: slli s6, s6, 9 -; NOREMAT-NEXT: slli s8, t1, 11 -; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: slli s9, s9, 9 +; NOREMAT-NEXT: li a2, 
7 +; NOREMAT-NEXT: lui t4, 1 +; NOREMAT-NEXT: li s8, 9 +; NOREMAT-NEXT: li s10, 11 ; NOREMAT-NEXT: li t5, 13 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s10, t0, 10 -; NOREMAT-NEXT: vle32.v v0, (a6) -; NOREMAT-NEXT: vle32.v v12, (a6) -; NOREMAT-NEXT: slli ra, ra, 9 +; NOREMAT-NEXT: lui s1, 2 +; NOREMAT-NEXT: lui t3, 3 +; NOREMAT-NEXT: lui s3, 4 +; NOREMAT-NEXT: lui s11, 5 +; NOREMAT-NEXT: lui t2, 6 +; NOREMAT-NEXT: lui t6, 7 +; NOREMAT-NEXT: lui s5, 8 +; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOREMAT-NEXT: slli t0, t0, 11 +; NOREMAT-NEXT: slli s0, a3, 9 +; NOREMAT-NEXT: slli s4, t1, 10 +; NOREMAT-NEXT: slli s6, a2, 9 +; NOREMAT-NEXT: add a0, a7, t4 +; NOREMAT-NEXT: slli s8, s8, 9 +; NOREMAT-NEXT: slli s9, a3, 10 +; NOREMAT-NEXT: vle32.v v10, (a6) +; NOREMAT-NEXT: slli s10, s10, 9 +; NOREMAT-NEXT: slli ra, t1, 11 +; NOREMAT-NEXT: vle32.v v14, (a4) +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: slli t5, t5, 9 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: vle32.v v2, (a0) ; NOREMAT-NEXT: vle32.v v4, (a0) -; NOREMAT-NEXT: vle32.v v20, (a0) -; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: add a5, a7, t3 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a7, s1 -; NOREMAT-NEXT: vle32.v v28, (a4) -; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: add a4, a7, s3 +; NOREMAT-NEXT: vle32.v v28, (a5) +; NOREMAT-NEXT: vle32.v v26, (a5) +; NOREMAT-NEXT: add a5, a7, s11 ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a7, s0 -; NOREMAT-NEXT: vle32.v v14, (a7) -; NOREMAT-NEXT: vle32.v v18, (a4) +; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: vle32.v v20, (a5) +; NOREMAT-NEXT: vle32.v v18, (a5) +; NOREMAT-NEXT: add a5, a7, t6 +; NOREMAT-NEXT: vle32.v v16, (a7) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v10 ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a7, s3 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v14 ; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; NOREMAT-NEXT: add a4, a7, t4 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 -; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, a3 -; NOREMAT-NEXT: vle32.v v0, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, t6 -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 -; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, s2 -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s7 -; NOREMAT-NEXT: vle32.v v0, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s4 -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 -; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: add a4, a7, s5 -; NOREMAT-NEXT: vle32.v v4, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s6 -; NOREMAT-NEXT: vle32.v v20, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 +; NOREMAT-NEXT: vle32.v v10, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v0 +; NOREMAT-NEXT: vle32.v v8, (a5) +; NOREMAT-NEXT: add a5, a7, t0 +; 
NOREMAT-NEXT: mv t3, t0 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 +; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: add a5, a7, s0 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 +; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: add a5, a7, s4 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 +; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: add a5, a7, s6 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: slli a6, a2, 10 +; NOREMAT-NEXT: sd a6, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v0, (a5) ; NOREMAT-NEXT: add a4, a7, s8 -; NOREMAT-NEXT: vle32.v v4, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 -; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v2 +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: add a4, a7, s9 -; NOREMAT-NEXT: vle32.v v20, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 -; NOREMAT-NEXT: vle32.v v12, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v2 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: add a4, a7, s10 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v4 ; NOREMAT-NEXT: vle32.v v4, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 -; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v4 +; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 -; NOREMAT-NEXT: lui t4, 8 -; NOREMAT-NEXT: add a5, a7, t4 -; NOREMAT-NEXT: vle32.v v20, (a5) -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 +; NOREMAT-NEXT: add a4, a7, t5 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v4 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: add a4, a7, a6 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v4 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: li a5, 15 +; NOREMAT-NEXT: slli a4, a5, 9 +; NOREMAT-NEXT: sd a4, 600(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v4 +; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v4 ; NOREMAT-NEXT: li a4, 17 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li s1, 17 -; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li t4, 17 +; NOREMAT-NEXT: sd a4, 592(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a7, a4 -; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 -; NOREMAT-NEXT: li a5, 9 -; NOREMAT-NEXT: slli a4, a5, 10 -; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v6 +; NOREMAT-NEXT: li t1, 9 +; NOREMAT-NEXT: slli a4, t1, 10 +; NOREMAT-NEXT: sd a4, 584(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a7, a4 -; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v4 ; NOREMAT-NEXT: li a4, 19 ; NOREMAT-NEXT: slli a4, a4, 9 -; NOREMAT-NEXT: li t2, 19 -; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li s1, 19 +; NOREMAT-NEXT: sd a4, 576(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a7, a4 -; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: slli a3, a2, 11 -; NOREMAT-NEXT: sd a3, 600(sp) # 
8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: slli a3, a3, 11 +; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v6 ; NOREMAT-NEXT: add a3, a7, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 +; NOREMAT-NEXT: vle32.v v6, (a3) +; NOREMAT-NEXT: vle32.v v2, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v30 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 -; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 -; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: vle32.v v6, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: li a6, 11 -; NOREMAT-NEXT: slli a3, a6, 10 -; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; NOREMAT-NEXT: li s3, 23 -; NOREMAT-NEXT: slli a3, s3, 9 -; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v0, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v6 +; NOREMAT-NEXT: li a4, 11 +; NOREMAT-NEXT: slli a3, a4, 10 +; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 -; NOREMAT-NEXT: vle32.v v8, (a3) +; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 -; NOREMAT-NEXT: li s0, 25 -; NOREMAT-NEXT: slli a3, s0, 9 -; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v30 +; NOREMAT-NEXT: li s2, 23 +; NOREMAT-NEXT: slli a3, s2, 9 +; NOREMAT-NEXT: sd a3, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: vle32.v v30, (a3) +; NOREMAT-NEXT: vle32.v v2, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v6 +; NOREMAT-NEXT: li t6, 25 +; NOREMAT-NEXT: slli a3, t6, 9 +; NOREMAT-NEXT: sd a3, 536(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: slli a3, t5, 10 -; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v0, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v30 +; NOREMAT-NEXT: li a6, 13 +; NOREMAT-NEXT: slli a3, a6, 10 +; NOREMAT-NEXT: sd a3, 528(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 -; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 -; NOREMAT-NEXT: li t3, 27 -; NOREMAT-NEXT: slli a3, t3, 9 -; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v4, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v28 +; NOREMAT-NEXT: li t2, 27 +; NOREMAT-NEXT: slli a3, t2, 9 +; NOREMAT-NEXT: sd a3, 520(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a7, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) -; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: slli a2, t0, 11 -; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 +; NOREMAT-NEXT: vle32.v v2, (a3) +; NOREMAT-NEXT: slli a2, a2, 11 +; NOREMAT-NEXT: sd a2, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v6 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 -; NOREMAT-NEXT: li t0, 29 -; NOREMAT-NEXT: slli a2, t0, 9 -; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 -; 
NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: li a3, 15 -; NOREMAT-NEXT: slli a2, a3, 10 -; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v30 +; NOREMAT-NEXT: li a3, 29 +; NOREMAT-NEXT: slli a2, a3, 9 +; NOREMAT-NEXT: sd a2, 504(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: li t1, 31 -; NOREMAT-NEXT: slli a2, t1, 9 -; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v0, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 +; NOREMAT-NEXT: slli a2, a5, 10 +; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li t0, 15 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: lui a4, 4 -; NOREMAT-NEXT: addiw a0, a4, 512 -; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v26 +; NOREMAT-NEXT: li a5, 31 +; NOREMAT-NEXT: slli a0, a5, 9 +; NOREMAT-NEXT: sd a0, 488(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: vle32.v v8, (a0) ; NOREMAT-NEXT: vle32.v v26, (a0) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 -; NOREMAT-NEXT: slli a2, s1, 10 -; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: vle32.v v6, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addiw a2, a4, 1536 +; NOREMAT-NEXT: vle32.v v2, (a0) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v30 +; NOREMAT-NEXT: addiw a2, s3, 512 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: slli a2, a5, 11 +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v28 +; NOREMAT-NEXT: slli a2, t4, 10 ; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v24, (a2) -; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 -; NOREMAT-NEXT: lui a5, 5 -; NOREMAT-NEXT: addiw a2, a5, -1536 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v0, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 +; NOREMAT-NEXT: addiw a2, s3, 1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: vle32.v v22, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 -; NOREMAT-NEXT: slli a2, t2, 10 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: slli a2, t1, 11 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t2, 19 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v24 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 -; NOREMAT-NEXT: addiw a2, a5, -512 +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v30 +; NOREMAT-NEXT: addiw a2, s11, -1536 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: vle32.v v6, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 -; NOREMAT-NEXT: addiw a2, a5, 512 +; NOREMAT-NEXT: vle32.v v22, (a2) +; 
NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 +; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v24, (a2) -; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: slli a2, s7, 10 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v26 +; NOREMAT-NEXT: addiw a2, s11, -512 ; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 -; NOREMAT-NEXT: addiw a2, a5, 1536 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v0, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 +; NOREMAT-NEXT: addiw a2, s11, 512 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v22, (a2) -; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: slli a2, a6, 11 +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: slli a2, s7, 10 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v22 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 -; NOREMAT-NEXT: lui a6, 6 -; NOREMAT-NEXT: addiw a2, a6, -1536 +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 +; NOREMAT-NEXT: addiw a2, s11, 1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v18, (a2) -; NOREMAT-NEXT: vle32.v v6, (a2) -; NOREMAT-NEXT: slli a2, s3, 10 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: slli a2, a4, 11 ; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v26 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v16, (a2) -; NOREMAT-NEXT: vle32.v v24, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 -; NOREMAT-NEXT: addiw a2, a6, -512 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v20 +; NOREMAT-NEXT: lui a4, 6 +; NOREMAT-NEXT: addiw a2, a4, -1536 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 -; NOREMAT-NEXT: addiw a2, a6, 512 +; NOREMAT-NEXT: vle32.v v20, (a2) +; NOREMAT-NEXT: vle32.v v0, (a2) +; NOREMAT-NEXT: slli a2, s2, 10 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v24 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v22, (a2) -; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: slli a2, s0, 10 +; NOREMAT-NEXT: vle32.v v18, (a2) +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 +; NOREMAT-NEXT: addiw a2, a4, -512 ; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v26, (a2) -; NOREMAT-NEXT: vle32.v v2, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 -; NOREMAT-NEXT: addiw a2, a6, 1536 +; NOREMAT-NEXT: vle32.v v22, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v28 +; NOREMAT-NEXT: addiw a2, a4, 512 ; NOREMAT-NEXT: sd a2, 368(sp) # 
8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, t5, 11 +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: slli a2, t6, 10 ; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v26 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v16, (a2) -; NOREMAT-NEXT: vle32.v v6, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 -; NOREMAT-NEXT: lui s0, 7 -; NOREMAT-NEXT: addiw a2, s0, -1536 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v20 +; NOREMAT-NEXT: addiw a2, a4, 1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: vle32.v v24, (a2) -; NOREMAT-NEXT: slli a2, t3, 10 +; NOREMAT-NEXT: vle32.v v20, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v18 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vl2r.v v12, (a0) # Unknown-size Folded Reload -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 -; NOREMAT-NEXT: addiw a2, s0, -512 +; NOREMAT-NEXT: vle32.v v18, (a2) +; NOREMAT-NEXT: vle32.v v0, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v22 +; NOREMAT-NEXT: lui a6, 7 +; NOREMAT-NEXT: addiw a2, a6, -1536 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 -; NOREMAT-NEXT: addiw a2, s0, 512 +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: slli a2, t2, 10 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: lui t3, 7 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v16 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) -; NOREMAT-NEXT: slli a2, t0, 10 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v28 +; NOREMAT-NEXT: addiw a2, a6, -512 ; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v18, (a2) -; NOREMAT-NEXT: vle32.v v2, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 -; NOREMAT-NEXT: addiw a2, t3, 1536 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v14, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v26 +; NOREMAT-NEXT: addiw a2, a6, 512 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v16, (a2) -; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a3, 11 +; NOREMAT-NEXT: vle32.v v26, (a2) +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: slli a2, a3, 10 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v20 ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: vle32.v v6, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 -; NOREMAT-NEXT: addiw a2, t4, -1536 +; NOREMAT-NEXT: vle32.v v20, (a2) +; NOREMAT-NEXT: vle32.v v30, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 +; NOREMAT-NEXT: addiw a2, a6, 1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a7, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: vle32.v v24, (a2) -; 
NOREMAT-NEXT: slli a2, t1, 10 +; NOREMAT-NEXT: vle32.v v18, (a2) +; NOREMAT-NEXT: vle32.v v6, (a2) +; NOREMAT-NEXT: slli a2, t0, 11 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v22 ; NOREMAT-NEXT: add a2, a7, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) -; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addiw a0, t4, -512 -; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v0, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v16 +; NOREMAT-NEXT: addiw a2, s5, -1536 +; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v16, (a2) +; NOREMAT-NEXT: vle32.v v24, (a2) +; NOREMAT-NEXT: slli a2, a5, 10 +; NOREMAT-NEXT: sd a2, 272(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 +; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: vle32.v v28, (a2) +; NOREMAT-NEXT: vle32.v v4, (a2) +; NOREMAT-NEXT: addiw a0, s5, -512 +; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 -; NOREMAT-NEXT: vle32.v v12, (a0) -; NOREMAT-NEXT: vle32.v v0, (a0) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v26 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v18 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v16 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v8 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v14 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v22 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v20 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v10 +; NOREMAT-NEXT: vle32.v v10, (a0) +; NOREMAT-NEXT: vle32.v v14, (a0) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v20 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v18 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v22 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v16 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v28 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v10 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v12 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addi a0, a1, 1024 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: add s11, a1, s11 -; NOREMAT-NEXT: sd s11, 272(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: lui a0, 1 +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 2 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sd a0, 264(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a0, 248(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 3 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sd a0, 256(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a0, 240(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add s3, a1, s3 +; NOREMAT-NEXT: sd s3, 232(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add s11, a1, s11 +; NOREMAT-NEXT: sd s11, 224(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a1, a4 -; NOREMAT-NEXT: sd a4, 248(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a1, a5 -; NOREMAT-NEXT: sd a5, 240(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a4, 216(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a6, a1, a6 -; NOREMAT-NEXT: sd a6, 232(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add t3, a1, t3 -; NOREMAT-NEXT: sd t3, 224(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a1, t4 -; NOREMAT-NEXT: sd a0, 216(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 512 -; NOREMAT-NEXT: sd a0, 192(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 1024 +; NOREMAT-NEXT: sd a6, 208(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a1, s5 +; NOREMAT-NEXT: sd a0, 200(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a0, s5, 512 ; NOREMAT-NEXT: sd 
a0, 176(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a0, t4, 1536 +; NOREMAT-NEXT: addiw a0, s5, 1024 ; NOREMAT-NEXT: sd a0, 160(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli s1, s1, 11 -; NOREMAT-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a0, s5, 1536 +; NOREMAT-NEXT: sd a0, 144(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t4, t4, 11 +; NOREMAT-NEXT: sd t4, 112(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui a0, 9 ; NOREMAT-NEXT: addiw a2, a0, -1536 -; NOREMAT-NEXT: sd a2, 88(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: addiw a2, a0, -1024 ; NOREMAT-NEXT: sd a2, 72(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addiw a2, a0, -1024 +; NOREMAT-NEXT: sd a2, 56(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: addiw a2, a0, -512 -; NOREMAT-NEXT: sd a2, 40(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 24(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a1, a0 -; NOREMAT-NEXT: sd a2, 208(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 192(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: addiw s11, a0, 512 ; NOREMAT-NEXT: addiw s7, a0, 1024 ; NOREMAT-NEXT: addiw s3, a0, 1536 -; NOREMAT-NEXT: slli s1, t2, 11 +; NOREMAT-NEXT: slli s1, s1, 11 ; NOREMAT-NEXT: lui a0, 10 ; NOREMAT-NEXT: addiw t2, a0, -1536 ; NOREMAT-NEXT: addiw a7, a0, -1024 ; NOREMAT-NEXT: addiw a4, a0, -512 ; NOREMAT-NEXT: add a2, a1, a0 -; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 184(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: addiw a0, a0, 512 -; NOREMAT-NEXT: ld a2, 512(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: ld a3, 504(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a3, a1, a3 -; NOREMAT-NEXT: add a5, a1, t6 -; NOREMAT-NEXT: add a6, a1, s2 -; NOREMAT-NEXT: add t0, a1, s4 -; NOREMAT-NEXT: add t1, a1, s5 -; NOREMAT-NEXT: add t3, a1, s6 -; NOREMAT-NEXT: add t4, a1, s8 -; NOREMAT-NEXT: add t5, a1, s9 -; NOREMAT-NEXT: add t6, a1, s10 -; NOREMAT-NEXT: add s0, a1, ra -; NOREMAT-NEXT: ld s2, 624(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a2, a1, t3 +; NOREMAT-NEXT: add a3, a1, s0 +; NOREMAT-NEXT: add a5, a1, s4 +; NOREMAT-NEXT: add a6, a1, s6 +; NOREMAT-NEXT: add t0, a1, s8 +; NOREMAT-NEXT: add t1, a1, s9 +; NOREMAT-NEXT: add t3, a1, s10 +; NOREMAT-NEXT: add t4, a1, ra +; NOREMAT-NEXT: add t5, a1, t5 +; NOREMAT-NEXT: ld t6, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add t6, a1, t6 +; NOREMAT-NEXT: ld s0, 600(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add s0, a1, s0 +; NOREMAT-NEXT: ld s2, 592(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s2, a1, s2 -; NOREMAT-NEXT: ld s4, 616(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 584(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s4, a1, s4 -; NOREMAT-NEXT: ld s5, 608(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 576(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s5, a1, s5 -; NOREMAT-NEXT: ld s6, 600(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 568(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s6, a1, s6 -; NOREMAT-NEXT: ld s8, 592(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 560(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s8, a1, s8 -; NOREMAT-NEXT: ld s9, 584(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 552(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s9, a1, s9 -; NOREMAT-NEXT: ld s10, 576(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s10, 544(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add s10, a1, s10 -; NOREMAT-NEXT: ld ra, 568(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld 
ra, 560(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 552(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 544(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 536(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 0(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 528(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 520(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 16(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 512(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 32(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 504(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 496(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 96(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 48(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 488(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 64(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 480(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 112(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 80(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 472(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 88(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 464(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 136(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 96(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 456(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 144(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 104(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 448(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 152(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 120(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 440(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 168(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 128(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 432(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 184(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 136(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 424(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 424(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 152(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 416(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 432(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 168(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 408(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 440(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 408(sp) # 8-byte Folded 
Spill ; NOREMAT-NEXT: ld ra, 400(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 448(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 392(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 456(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 424(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 384(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 464(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 432(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 376(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 472(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 440(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 368(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 480(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 448(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 360(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 488(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 456(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 352(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 464(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 344(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 472(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 336(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 480(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 328(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 520(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 488(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 320(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 528(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 496(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 312(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 504(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 304(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 512(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 296(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 520(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 288(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd ra, 528(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: ld ra, 280(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 536(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 272(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 544(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 264(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 552(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 176(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: sd ra, 560(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: ld ra, 160(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add ra, a1, ra ; 
NOREMAT-NEXT: sd ra, 568(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 192(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 144(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 576(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 176(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 112(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 584(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 160(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 592(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 128(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 600(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add ra, a1, ra ; NOREMAT-NEXT: sd ra, 608(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 616(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add ra, a1, ra -; NOREMAT-NEXT: sd ra, 624(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add ra, a1, s11 ; NOREMAT-NEXT: add s11, a1, s7 ; NOREMAT-NEXT: add s7, a1, s3 @@ -657,7 +646,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (a6) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 272(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 256(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (t0) @@ -674,7 +663,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (s0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 264(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 248(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (s2) @@ -691,31 +680,37 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (s10) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 256(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 240(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 0(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 24(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 48(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 40(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 56(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 64(sp) # 8-byte Folded Reload ; 
NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 232(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 80(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 248(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 88(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 96(sp) # 8-byte Folded Reload @@ -724,28 +719,28 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: ld a0, 104(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 112(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 120(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 120(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 128(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 136(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 144(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 224(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 152(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 240(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 168(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 168(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 408(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 184(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 416(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 424(sp) # 8-byte Folded Reload @@ -757,13 +752,13 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: ld a0, 440(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 448(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 216(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 456(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 448(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 232(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 456(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 464(sp) # 8-byte Folded Reload @@ -781,13 +776,13 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: ld a0, 496(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 504(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 208(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 512(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 504(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 224(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 
512(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 520(sp) # 8-byte Folded Reload @@ -805,13 +800,13 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: ld a0, 552(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 560(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 200(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 568(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 560(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 216(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 568(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 576(sp) # 8-byte Folded Reload @@ -829,13 +824,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: ld a0, 608(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 616(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 624(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 208(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 192(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (ra) @@ -852,29 +841,25 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (a7) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld a0, 200(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 184(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: csrr a0, vlenb -; NOREMAT-NEXT: slli a0, a0, 1 -; NOREMAT-NEXT: add sp, sp, a0 -; NOREMAT-NEXT: .cfi_def_cfa sp, 752 -; NOREMAT-NEXT: ld ra, 744(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s0, 736(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s1, 728(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s2, 720(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s3, 712(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s4, 704(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s5, 696(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s6, 688(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s7, 680(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s8, 672(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s9, 664(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s10, 656(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s11, 648(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld ra, 712(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s0, 704(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s1, 696(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s2, 688(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s3, 680(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 672(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 664(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 656(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s7, 648(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 640(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 632(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s10, 
624(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s11, 616(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: .cfi_restore ra ; NOREMAT-NEXT: .cfi_restore s0 ; NOREMAT-NEXT: .cfi_restore s1 @@ -888,7 +873,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: .cfi_restore s9 ; NOREMAT-NEXT: .cfi_restore s10 ; NOREMAT-NEXT: .cfi_restore s11 -; NOREMAT-NEXT: addi sp, sp, 752 +; NOREMAT-NEXT: addi sp, sp, 720 ; NOREMAT-NEXT: .cfi_def_cfa_offset 0 ; NOREMAT-NEXT: ret ; @@ -923,10 +908,10 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: .cfi_offset s10, -96 ; REMAT-NEXT: .cfi_offset s11, -104 ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 18 +; REMAT-NEXT: li a3, 14 ; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: sub sp, sp, a2 -; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb +; REMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x0e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 14 * vlenb ; REMAT-NEXT: li a4, 32 ; REMAT-NEXT: addi a5, a0, 512 ; REMAT-NEXT: addi a3, a0, 1024 @@ -976,51 +961,32 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v8, (a5) ; REMAT-NEXT: li a4, 13 ; REMAT-NEXT: slli a4, a4, 10 -; REMAT-NEXT: vle32.v v10, (a3) ; REMAT-NEXT: vle32.v v12, (a3) +; REMAT-NEXT: vle32.v v14, (a3) ; REMAT-NEXT: li a3, 27 ; REMAT-NEXT: slli a3, a3, 9 -; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: add a2, a0, a6 ; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: add a2, a0, a6 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: add a2, a0, a7 ; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: add a2, a0, a7 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: add a2, a0, t0 ; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: add a2, a0, t0 ; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: add a2, a0, t1 ; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: add a2, a0, t1 ; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 4 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: add a2, a0, t2 -; REMAT-NEXT: vle32.v v4, (a0) ; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a5, 14 -; REMAT-NEXT: mul a2, a2, a5 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, t3 -; REMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 -; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: add a2, a0, t4 ; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v14 +; REMAT-NEXT: add a2, a0, t3 +; REMAT-NEXT: vle32.v v0, (a0) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v8 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: add a2, a0, t5 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v12 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb ; REMAT-NEXT: li a5, 12 @@ -1028,117 +994,112 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill -; REMAT-NEXT: add a2, a0, t6 +; REMAT-NEXT: add a2, a0, t4 +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v16 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: add a2, a0, t5 +; REMAT-NEXT: 
vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 +; REMAT-NEXT: add a2, a0, t6 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: add a2, a0, s0 +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 +; REMAT-NEXT: add a2, a0, s0 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: add a2, a0, s1 +; REMAT-NEXT: sf.vc.vv 3, 0, v26, v28 ; REMAT-NEXT: vle32.v v26, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v30 +; REMAT-NEXT: add a2, a0, s1 ; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v6 +; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: add a2, a0, s2 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: slli a5, a5, 4 -; REMAT-NEXT: add a5, sp, a5 -; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v12, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 -; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v2 +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: add a2, a0, s3 -; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v0 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: vle32.v v10, (a2) ; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: li a6, 14 +; REMAT-NEXT: li a6, 12 ; REMAT-NEXT: mul a5, a5, a6 ; REMAT-NEXT: add a5, sp, a5 ; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v16, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v4 +; REMAT-NEXT: vl2r.v v30, (a5) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v12 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: add a2, a0, s4 -; REMAT-NEXT: vle32.v v16, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: add a2, a0, s5 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v14 -; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v16 +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: csrr a2, vlenb +; REMAT-NEXT: li a5, 12 +; REMAT-NEXT: mul a2, a2, a5 +; REMAT-NEXT: add a2, sp, a2 +; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: add a2, a0, s6 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: csrr a5, vlenb -; REMAT-NEXT: li a6, 12 -; REMAT-NEXT: mul a5, a5, a6 -; REMAT-NEXT: add a5, sp, a5 -; REMAT-NEXT: addi a5, a5, 432 -; REMAT-NEXT: vl2r.v v0, (a5) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 -; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v20 ; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v20, v22 +; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) ; REMAT-NEXT: add a2, a0, s8 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v24, v26 +; REMAT-NEXT: sf.vc.vv 3, 0, v26, v28 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: add a2, a0, s9 ; REMAT-NEXT: vle32.v v24, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 -; REMAT-NEXT: vle32.v v28, (a2) -; REMAT-NEXT: add a2, a0, s10 +; REMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; REMAT-NEXT: vle32.v v8, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v12, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: add a2, a0, s10 +; 
REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v2 +; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: add a2, a0, s11 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v30, v16 -; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v10 +; REMAT-NEXT: vle32.v v10, (a2) ; REMAT-NEXT: add a2, a0, ra ; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v6, v10 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 1 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v12 +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: add a2, a0, a4 -; REMAT-NEXT: vle32.v v10, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v4, v14 +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: csrr a4, vlenb +; REMAT-NEXT: li a5, 12 +; REMAT-NEXT: mul a4, a4, a5 +; REMAT-NEXT: add a4, sp, a4 +; REMAT-NEXT: addi a4, a4, 432 +; REMAT-NEXT: vl2r.v v30, (a4) # Unknown-size Folded Reload +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v14 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 2 -; REMAT-NEXT: add a2, sp, a2 -; REMAT-NEXT: addi a2, a2, 432 +; REMAT-NEXT: addi a2, sp, 432 ; REMAT-NEXT: vs2r.v v14, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: add a2, a0, a3 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v0, v18 -; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v16 +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 4 +; REMAT-NEXT: li a3, 12 +; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 -; REMAT-NEXT: vs2r.v v18, (a2) # Unknown-size Folded Spill +; REMAT-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: li a5, 7 ; REMAT-NEXT: slli a5, a5, 11 ; REMAT-NEXT: add a2, a0, a5 -; REMAT-NEXT: vle32.v v18, (a2) -; REMAT-NEXT: addi a3, sp, 432 -; REMAT-NEXT: vs2r.v v18, (a3) # Unknown-size Folded Spill +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v20 ; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 14 +; REMAT-NEXT: li a3, 10 ; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 @@ -1150,8 +1111,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.vv 3, 0, v26, v24 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 12 -; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: slli a2, a2, 3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v20, (a2) # Unknown-size Folded Spill @@ -1159,10 +1119,10 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v28, v8 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v6 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 10 +; REMAT-NEXT: li a3, 6 ; REMAT-NEXT: mul a2, a2, a3 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 @@ -1171,26 +1131,20 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 3 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v12 +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v4 ; REMAT-NEXT: 
vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: slli a2, a2, 3 +; REMAT-NEXT: slli a2, a2, 2 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v16, v2 +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v2 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: csrr a2, vlenb -; REMAT-NEXT: li a3, 6 -; REMAT-NEXT: mul a2, a2, a3 +; REMAT-NEXT: slli a2, a2, 1 ; REMAT-NEXT: add a2, sp, a2 ; REMAT-NEXT: addi a2, a2, 432 ; REMAT-NEXT: vs2r.v v8, (a2) # Unknown-size Folded Spill @@ -1198,21 +1152,13 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v2, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 1 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 -; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: li a2, 17 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) -; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 2 -; REMAT-NEXT: add a3, sp, a3 -; REMAT-NEXT: addi a3, a3, 432 +; REMAT-NEXT: addi a3, sp, 432 ; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 ; REMAT-NEXT: vle32.v v22, (a2) @@ -1221,20 +1167,19 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 4 +; REMAT-NEXT: li a4, 12 +; REMAT-NEXT: mul a3, a3, a4 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: addi a3, sp, 432 -; REMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v16 ; REMAT-NEXT: vle32.v v8, (a2) ; REMAT-NEXT: li a2, 9 ; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v26, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 14 +; REMAT-NEXT: li a4, 10 ; REMAT-NEXT: mul a3, a3, a4 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 @@ -1246,8 +1191,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v28, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 12 -; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: slli a3, a3, 3 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload @@ -1258,7 +1202,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v30, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 10 +; REMAT-NEXT: li a4, 6 ; REMAT-NEXT: mul a3, a3, a4 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 @@ -1270,7 +1214,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v6, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: slli a3, a3, 3 +; REMAT-NEXT: slli a3, a3, 2 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v16, (a3) # Unknown-size Folded Reload @@ -1280,8 +1224,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: csrr a3, vlenb -; REMAT-NEXT: li a4, 6 -; REMAT-NEXT: mul a3, a3, a4 +; REMAT-NEXT: slli 
a3, a3, 1 ; REMAT-NEXT: add a3, sp, a3 ; REMAT-NEXT: addi a3, a3, 432 ; REMAT-NEXT: vl2r.v v18, (a3) # Unknown-size Folded Reload @@ -1293,15 +1236,15 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: vle32.v v2, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v20, v0 ; REMAT-NEXT: vle32.v v20, (a2) -; REMAT-NEXT: li s7, 21 -; REMAT-NEXT: slli s7, s7, 10 -; REMAT-NEXT: add a2, a0, s7 +; REMAT-NEXT: li a2, 21 +; REMAT-NEXT: slli a2, a2, 10 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v0, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: vle32.v v22, (a2) -; REMAT-NEXT: lui s4, 5 -; REMAT-NEXT: addiw s4, s4, 1536 -; REMAT-NEXT: add a2, a0, s4 +; REMAT-NEXT: lui a2, 5 +; REMAT-NEXT: addiw a2, a2, 1536 +; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v24, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: vle32.v v8, (a2) @@ -1489,18 +1432,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: lui a0, 2 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 320(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 17 -; REMAT-NEXT: slli a0, a0, 9 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 312(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s4, a1, s4 +; REMAT-NEXT: sd s4, 312(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s5, a1, s5 ; REMAT-NEXT: sd s5, 304(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s6, a1, s6 ; REMAT-NEXT: sd s6, 296(sp) # 8-byte Folded Spill -; REMAT-NEXT: li a0, 5 -; REMAT-NEXT: slli a0, a0, 11 -; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sd a0, 288(sp) # 8-byte Folded Spill +; REMAT-NEXT: add s7, a1, s7 +; REMAT-NEXT: sd s7, 288(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s8, a1, s8 ; REMAT-NEXT: sd s8, 280(sp) # 8-byte Folded Spill ; REMAT-NEXT: add s9, a1, s9 @@ -1571,10 +1510,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: addiw a0, a0, 512 ; REMAT-NEXT: add a0, a1, a0 ; REMAT-NEXT: sd a0, 120(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s7, a1, s7 -; REMAT-NEXT: sd s7, 112(sp) # 8-byte Folded Spill -; REMAT-NEXT: add s4, a1, s4 -; REMAT-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; REMAT-NEXT: li a0, 21 +; REMAT-NEXT: slli a0, a0, 10 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 112(sp) # 8-byte Folded Spill +; REMAT-NEXT: lui a0, 5 +; REMAT-NEXT: addiw a0, a0, 1536 +; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: sd a0, 104(sp) # 8-byte Folded Spill ; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 @@ -1879,7 +1822,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: csrr a0, vlenb -; REMAT-NEXT: li a1, 18 +; REMAT-NEXT: li a1, 14 ; REMAT-NEXT: mul a0, a0, a1 ; REMAT-NEXT: add sp, sp, a0 ; REMAT-NEXT: .cfi_def_cfa sp, 544 diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index 1fbdaa76dfb68..5ce5849af700c 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -1174,16 +1174,16 @@ define i32 @varargs(ptr %fmt, ...) 
{ ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: addi sp, sp, -80 ; RV64IZCMP-NEXT: .cfi_def_cfa_offset 80 -; RV64IZCMP-NEXT: sd a1, 24(sp) -; RV64IZCMP-NEXT: addi a0, sp, 28 -; RV64IZCMP-NEXT: sd a0, 8(sp) -; RV64IZCMP-NEXT: lw a0, 24(sp) ; RV64IZCMP-NEXT: sd a5, 56(sp) ; RV64IZCMP-NEXT: sd a6, 64(sp) ; RV64IZCMP-NEXT: sd a7, 72(sp) +; RV64IZCMP-NEXT: sd a1, 24(sp) ; RV64IZCMP-NEXT: sd a2, 32(sp) ; RV64IZCMP-NEXT: sd a3, 40(sp) ; RV64IZCMP-NEXT: sd a4, 48(sp) +; RV64IZCMP-NEXT: addi a0, sp, 28 +; RV64IZCMP-NEXT: sd a0, 8(sp) +; RV64IZCMP-NEXT: lw a0, 24(sp) ; RV64IZCMP-NEXT: addi sp, sp, 80 ; RV64IZCMP-NEXT: .cfi_def_cfa_offset 0 ; RV64IZCMP-NEXT: ret @@ -1210,16 +1210,16 @@ define i32 @varargs(ptr %fmt, ...) { ; RV64IZCMP-SR: # %bb.0: ; RV64IZCMP-SR-NEXT: addi sp, sp, -80 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 80 -; RV64IZCMP-SR-NEXT: sd a1, 24(sp) -; RV64IZCMP-SR-NEXT: addi a0, sp, 28 -; RV64IZCMP-SR-NEXT: sd a0, 8(sp) -; RV64IZCMP-SR-NEXT: lw a0, 24(sp) ; RV64IZCMP-SR-NEXT: sd a5, 56(sp) ; RV64IZCMP-SR-NEXT: sd a6, 64(sp) ; RV64IZCMP-SR-NEXT: sd a7, 72(sp) +; RV64IZCMP-SR-NEXT: sd a1, 24(sp) ; RV64IZCMP-SR-NEXT: sd a2, 32(sp) ; RV64IZCMP-SR-NEXT: sd a3, 40(sp) ; RV64IZCMP-SR-NEXT: sd a4, 48(sp) +; RV64IZCMP-SR-NEXT: addi a0, sp, 28 +; RV64IZCMP-SR-NEXT: sd a0, 8(sp) +; RV64IZCMP-SR-NEXT: lw a0, 24(sp) ; RV64IZCMP-SR-NEXT: addi sp, sp, 80 ; RV64IZCMP-SR-NEXT: .cfi_def_cfa_offset 0 ; RV64IZCMP-SR-NEXT: ret @@ -1246,16 +1246,16 @@ define i32 @varargs(ptr %fmt, ...) { ; RV64I: # %bb.0: ; RV64I-NEXT: addi sp, sp, -80 ; RV64I-NEXT: .cfi_def_cfa_offset 80 -; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: addi a0, sp, 28 -; RV64I-NEXT: sd a0, 8(sp) -; RV64I-NEXT: lw a0, 24(sp) ; RV64I-NEXT: sd a5, 56(sp) ; RV64I-NEXT: sd a6, 64(sp) ; RV64I-NEXT: sd a7, 72(sp) +; RV64I-NEXT: sd a1, 24(sp) ; RV64I-NEXT: sd a2, 32(sp) ; RV64I-NEXT: sd a3, 40(sp) ; RV64I-NEXT: sd a4, 48(sp) +; RV64I-NEXT: addi a0, sp, 28 +; RV64I-NEXT: sd a0, 8(sp) +; RV64I-NEXT: lw a0, 24(sp) ; RV64I-NEXT: addi sp, sp, 80 ; RV64I-NEXT: .cfi_def_cfa_offset 0 ; RV64I-NEXT: ret @@ -1291,26 +1291,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) { ; RV32IZCMP-NEXT: lw t3, 20(a5) ; RV32IZCMP-NEXT: lw t4, 24(a5) ; RV32IZCMP-NEXT: lw t5, 28(a5) -; RV32IZCMP-NEXT: lw t6, 48(a5) -; RV32IZCMP-NEXT: lw s2, 52(a5) -; RV32IZCMP-NEXT: lw a3, 56(a5) -; RV32IZCMP-NEXT: lw a4, 60(a5) -; RV32IZCMP-NEXT: lw a1, 64(a5) -; RV32IZCMP-NEXT: lw s0, 68(a5) -; RV32IZCMP-NEXT: lw s3, 32(a5) -; RV32IZCMP-NEXT: lw s4, 36(a5) -; RV32IZCMP-NEXT: lw s1, 40(a5) -; RV32IZCMP-NEXT: lw a2, 44(a5) -; RV32IZCMP-NEXT: sw s0, 68(a5) -; RV32IZCMP-NEXT: sw a1, 64(a5) -; RV32IZCMP-NEXT: sw a4, 60(a5) -; RV32IZCMP-NEXT: sw a3, 56(a5) -; RV32IZCMP-NEXT: sw s2, 52(a5) -; RV32IZCMP-NEXT: sw t6, 48(a5) -; RV32IZCMP-NEXT: sw a2, 44(a5) -; RV32IZCMP-NEXT: sw s1, 40(a5) -; RV32IZCMP-NEXT: sw s4, 36(a5) -; RV32IZCMP-NEXT: sw s3, 32(a5) +; RV32IZCMP-NEXT: lw t6, 32(a5) +; RV32IZCMP-NEXT: lw s2, 36(a5) +; RV32IZCMP-NEXT: lw s3, 40(a5) +; RV32IZCMP-NEXT: lw s4, 44(a5) +; RV32IZCMP-NEXT: lw a1, 48(a5) +; RV32IZCMP-NEXT: lw s0, 52(a5) +; RV32IZCMP-NEXT: lw s1, 56(a5) +; RV32IZCMP-NEXT: lw a2, 60(a5) +; RV32IZCMP-NEXT: lw a3, 64(a5) +; RV32IZCMP-NEXT: lw a4, 68(a5) +; RV32IZCMP-NEXT: sw a4, 68(a5) +; RV32IZCMP-NEXT: sw a3, 64(a5) +; RV32IZCMP-NEXT: sw a2, 60(a5) +; RV32IZCMP-NEXT: sw s1, 56(a5) +; RV32IZCMP-NEXT: sw s0, 52(a5) +; RV32IZCMP-NEXT: sw a1, 48(a5) +; RV32IZCMP-NEXT: sw s4, 44(a5) +; RV32IZCMP-NEXT: sw s3, 40(a5) +; RV32IZCMP-NEXT: sw s2, 36(a5) +; 
RV32IZCMP-NEXT: sw t6, 32(a5) ; RV32IZCMP-NEXT: sw t5, 28(a5) ; RV32IZCMP-NEXT: sw t4, 24(a5) ; RV32IZCMP-NEXT: sw t3, 20(a5) @@ -1340,26 +1340,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) { ; RV64IZCMP-NEXT: lw t3, 20(a5) ; RV64IZCMP-NEXT: lw t4, 24(a5) ; RV64IZCMP-NEXT: lw t5, 28(a5) -; RV64IZCMP-NEXT: lw t6, 48(a5) -; RV64IZCMP-NEXT: lw s2, 52(a5) -; RV64IZCMP-NEXT: lw a3, 56(a5) -; RV64IZCMP-NEXT: lw a4, 60(a5) -; RV64IZCMP-NEXT: lw a1, 64(a5) -; RV64IZCMP-NEXT: lw s0, 68(a5) -; RV64IZCMP-NEXT: lw s3, 32(a5) -; RV64IZCMP-NEXT: lw s4, 36(a5) -; RV64IZCMP-NEXT: lw s1, 40(a5) -; RV64IZCMP-NEXT: lw a2, 44(a5) -; RV64IZCMP-NEXT: sw s0, 68(a5) -; RV64IZCMP-NEXT: sw a1, 64(a5) -; RV64IZCMP-NEXT: sw a4, 60(a5) -; RV64IZCMP-NEXT: sw a3, 56(a5) -; RV64IZCMP-NEXT: sw s2, 52(a5) -; RV64IZCMP-NEXT: sw t6, 48(a5) -; RV64IZCMP-NEXT: sw a2, 44(a5) -; RV64IZCMP-NEXT: sw s1, 40(a5) -; RV64IZCMP-NEXT: sw s4, 36(a5) -; RV64IZCMP-NEXT: sw s3, 32(a5) +; RV64IZCMP-NEXT: lw t6, 32(a5) +; RV64IZCMP-NEXT: lw s2, 36(a5) +; RV64IZCMP-NEXT: lw s3, 40(a5) +; RV64IZCMP-NEXT: lw s4, 44(a5) +; RV64IZCMP-NEXT: lw a1, 48(a5) +; RV64IZCMP-NEXT: lw s0, 52(a5) +; RV64IZCMP-NEXT: lw s1, 56(a5) +; RV64IZCMP-NEXT: lw a2, 60(a5) +; RV64IZCMP-NEXT: lw a3, 64(a5) +; RV64IZCMP-NEXT: lw a4, 68(a5) +; RV64IZCMP-NEXT: sw a4, 68(a5) +; RV64IZCMP-NEXT: sw a3, 64(a5) +; RV64IZCMP-NEXT: sw a2, 60(a5) +; RV64IZCMP-NEXT: sw s1, 56(a5) +; RV64IZCMP-NEXT: sw s0, 52(a5) +; RV64IZCMP-NEXT: sw a1, 48(a5) +; RV64IZCMP-NEXT: sw s4, 44(a5) +; RV64IZCMP-NEXT: sw s3, 40(a5) +; RV64IZCMP-NEXT: sw s2, 36(a5) +; RV64IZCMP-NEXT: sw t6, 32(a5) ; RV64IZCMP-NEXT: sw t5, 28(a5) ; RV64IZCMP-NEXT: sw t4, 24(a5) ; RV64IZCMP-NEXT: sw t3, 20(a5) @@ -1389,26 +1389,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) { ; RV32IZCMP-SR-NEXT: lw t3, 20(a5) ; RV32IZCMP-SR-NEXT: lw t4, 24(a5) ; RV32IZCMP-SR-NEXT: lw t5, 28(a5) -; RV32IZCMP-SR-NEXT: lw t6, 48(a5) -; RV32IZCMP-SR-NEXT: lw s2, 52(a5) -; RV32IZCMP-SR-NEXT: lw a3, 56(a5) -; RV32IZCMP-SR-NEXT: lw a4, 60(a5) -; RV32IZCMP-SR-NEXT: lw a1, 64(a5) -; RV32IZCMP-SR-NEXT: lw s0, 68(a5) -; RV32IZCMP-SR-NEXT: lw s3, 32(a5) -; RV32IZCMP-SR-NEXT: lw s4, 36(a5) -; RV32IZCMP-SR-NEXT: lw s1, 40(a5) -; RV32IZCMP-SR-NEXT: lw a2, 44(a5) -; RV32IZCMP-SR-NEXT: sw s0, 68(a5) -; RV32IZCMP-SR-NEXT: sw a1, 64(a5) -; RV32IZCMP-SR-NEXT: sw a4, 60(a5) -; RV32IZCMP-SR-NEXT: sw a3, 56(a5) -; RV32IZCMP-SR-NEXT: sw s2, 52(a5) -; RV32IZCMP-SR-NEXT: sw t6, 48(a5) -; RV32IZCMP-SR-NEXT: sw a2, 44(a5) -; RV32IZCMP-SR-NEXT: sw s1, 40(a5) -; RV32IZCMP-SR-NEXT: sw s4, 36(a5) -; RV32IZCMP-SR-NEXT: sw s3, 32(a5) +; RV32IZCMP-SR-NEXT: lw t6, 32(a5) +; RV32IZCMP-SR-NEXT: lw s2, 36(a5) +; RV32IZCMP-SR-NEXT: lw s3, 40(a5) +; RV32IZCMP-SR-NEXT: lw s4, 44(a5) +; RV32IZCMP-SR-NEXT: lw a1, 48(a5) +; RV32IZCMP-SR-NEXT: lw s0, 52(a5) +; RV32IZCMP-SR-NEXT: lw s1, 56(a5) +; RV32IZCMP-SR-NEXT: lw a2, 60(a5) +; RV32IZCMP-SR-NEXT: lw a3, 64(a5) +; RV32IZCMP-SR-NEXT: lw a4, 68(a5) +; RV32IZCMP-SR-NEXT: sw a4, 68(a5) +; RV32IZCMP-SR-NEXT: sw a3, 64(a5) +; RV32IZCMP-SR-NEXT: sw a2, 60(a5) +; RV32IZCMP-SR-NEXT: sw s1, 56(a5) +; RV32IZCMP-SR-NEXT: sw s0, 52(a5) +; RV32IZCMP-SR-NEXT: sw a1, 48(a5) +; RV32IZCMP-SR-NEXT: sw s4, 44(a5) +; RV32IZCMP-SR-NEXT: sw s3, 40(a5) +; RV32IZCMP-SR-NEXT: sw s2, 36(a5) +; RV32IZCMP-SR-NEXT: sw t6, 32(a5) ; RV32IZCMP-SR-NEXT: sw t5, 28(a5) ; RV32IZCMP-SR-NEXT: sw t4, 24(a5) ; RV32IZCMP-SR-NEXT: sw t3, 20(a5) @@ -1438,26 +1438,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, 
i32, i32, i32) { ; RV64IZCMP-SR-NEXT: lw t3, 20(a5) ; RV64IZCMP-SR-NEXT: lw t4, 24(a5) ; RV64IZCMP-SR-NEXT: lw t5, 28(a5) -; RV64IZCMP-SR-NEXT: lw t6, 48(a5) -; RV64IZCMP-SR-NEXT: lw s2, 52(a5) -; RV64IZCMP-SR-NEXT: lw a3, 56(a5) -; RV64IZCMP-SR-NEXT: lw a4, 60(a5) -; RV64IZCMP-SR-NEXT: lw a1, 64(a5) -; RV64IZCMP-SR-NEXT: lw s0, 68(a5) -; RV64IZCMP-SR-NEXT: lw s3, 32(a5) -; RV64IZCMP-SR-NEXT: lw s4, 36(a5) -; RV64IZCMP-SR-NEXT: lw s1, 40(a5) -; RV64IZCMP-SR-NEXT: lw a2, 44(a5) -; RV64IZCMP-SR-NEXT: sw s0, 68(a5) -; RV64IZCMP-SR-NEXT: sw a1, 64(a5) -; RV64IZCMP-SR-NEXT: sw a4, 60(a5) -; RV64IZCMP-SR-NEXT: sw a3, 56(a5) -; RV64IZCMP-SR-NEXT: sw s2, 52(a5) -; RV64IZCMP-SR-NEXT: sw t6, 48(a5) -; RV64IZCMP-SR-NEXT: sw a2, 44(a5) -; RV64IZCMP-SR-NEXT: sw s1, 40(a5) -; RV64IZCMP-SR-NEXT: sw s4, 36(a5) -; RV64IZCMP-SR-NEXT: sw s3, 32(a5) +; RV64IZCMP-SR-NEXT: lw t6, 32(a5) +; RV64IZCMP-SR-NEXT: lw s2, 36(a5) +; RV64IZCMP-SR-NEXT: lw s3, 40(a5) +; RV64IZCMP-SR-NEXT: lw s4, 44(a5) +; RV64IZCMP-SR-NEXT: lw a1, 48(a5) +; RV64IZCMP-SR-NEXT: lw s0, 52(a5) +; RV64IZCMP-SR-NEXT: lw s1, 56(a5) +; RV64IZCMP-SR-NEXT: lw a2, 60(a5) +; RV64IZCMP-SR-NEXT: lw a3, 64(a5) +; RV64IZCMP-SR-NEXT: lw a4, 68(a5) +; RV64IZCMP-SR-NEXT: sw a4, 68(a5) +; RV64IZCMP-SR-NEXT: sw a3, 64(a5) +; RV64IZCMP-SR-NEXT: sw a2, 60(a5) +; RV64IZCMP-SR-NEXT: sw s1, 56(a5) +; RV64IZCMP-SR-NEXT: sw s0, 52(a5) +; RV64IZCMP-SR-NEXT: sw a1, 48(a5) +; RV64IZCMP-SR-NEXT: sw s4, 44(a5) +; RV64IZCMP-SR-NEXT: sw s3, 40(a5) +; RV64IZCMP-SR-NEXT: sw s2, 36(a5) +; RV64IZCMP-SR-NEXT: sw t6, 32(a5) ; RV64IZCMP-SR-NEXT: sw t5, 28(a5) ; RV64IZCMP-SR-NEXT: sw t4, 24(a5) ; RV64IZCMP-SR-NEXT: sw t3, 20(a5) @@ -1492,26 +1492,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) { ; RV32I-NEXT: lw a7, 20(a5) ; RV32I-NEXT: lw t0, 24(a5) ; RV32I-NEXT: lw t1, 28(a5) -; RV32I-NEXT: lw t2, 48(a5) -; RV32I-NEXT: lw t3, 52(a5) -; RV32I-NEXT: lw t4, 56(a5) -; RV32I-NEXT: lw t5, 60(a5) -; RV32I-NEXT: lw t6, 64(a5) -; RV32I-NEXT: lw s0, 68(a5) -; RV32I-NEXT: lw s1, 32(a5) -; RV32I-NEXT: lw s2, 36(a5) -; RV32I-NEXT: lw s3, 40(a5) -; RV32I-NEXT: lw s4, 44(a5) -; RV32I-NEXT: sw s0, 68(a5) -; RV32I-NEXT: sw t6, 64(a5) -; RV32I-NEXT: sw t5, 60(a5) -; RV32I-NEXT: sw t4, 56(a5) -; RV32I-NEXT: sw t3, 52(a5) -; RV32I-NEXT: sw t2, 48(a5) -; RV32I-NEXT: sw s4, 44(a5) -; RV32I-NEXT: sw s3, 40(a5) -; RV32I-NEXT: sw s2, 36(a5) -; RV32I-NEXT: sw s1, 32(a5) +; RV32I-NEXT: lw t2, 32(a5) +; RV32I-NEXT: lw t3, 36(a5) +; RV32I-NEXT: lw t4, 40(a5) +; RV32I-NEXT: lw t5, 44(a5) +; RV32I-NEXT: lw t6, 48(a5) +; RV32I-NEXT: lw s0, 52(a5) +; RV32I-NEXT: lw s1, 56(a5) +; RV32I-NEXT: lw s2, 60(a5) +; RV32I-NEXT: lw s3, 64(a5) +; RV32I-NEXT: lw s4, 68(a5) +; RV32I-NEXT: sw s4, 68(a5) +; RV32I-NEXT: sw s3, 64(a5) +; RV32I-NEXT: sw s2, 60(a5) +; RV32I-NEXT: sw s1, 56(a5) +; RV32I-NEXT: sw s0, 52(a5) +; RV32I-NEXT: sw t6, 48(a5) +; RV32I-NEXT: sw t5, 44(a5) +; RV32I-NEXT: sw t4, 40(a5) +; RV32I-NEXT: sw t3, 36(a5) +; RV32I-NEXT: sw t2, 32(a5) ; RV32I-NEXT: sw t1, 28(a5) ; RV32I-NEXT: sw t0, 24(a5) ; RV32I-NEXT: sw a7, 20(a5) @@ -1558,26 +1558,26 @@ define void @many_args(i32, i32, i32, i32, i32, i32, i32, i32, i32) { ; RV64I-NEXT: lw a7, 20(a5) ; RV64I-NEXT: lw t0, 24(a5) ; RV64I-NEXT: lw t1, 28(a5) -; RV64I-NEXT: lw t2, 48(a5) -; RV64I-NEXT: lw t3, 52(a5) -; RV64I-NEXT: lw t4, 56(a5) -; RV64I-NEXT: lw t5, 60(a5) -; RV64I-NEXT: lw t6, 64(a5) -; RV64I-NEXT: lw s0, 68(a5) -; RV64I-NEXT: lw s1, 32(a5) -; RV64I-NEXT: lw s2, 36(a5) -; RV64I-NEXT: lw s3, 40(a5) -; RV64I-NEXT: lw s4, 
44(a5) -; RV64I-NEXT: sw s0, 68(a5) -; RV64I-NEXT: sw t6, 64(a5) -; RV64I-NEXT: sw t5, 60(a5) -; RV64I-NEXT: sw t4, 56(a5) -; RV64I-NEXT: sw t3, 52(a5) -; RV64I-NEXT: sw t2, 48(a5) -; RV64I-NEXT: sw s4, 44(a5) -; RV64I-NEXT: sw s3, 40(a5) -; RV64I-NEXT: sw s2, 36(a5) -; RV64I-NEXT: sw s1, 32(a5) +; RV64I-NEXT: lw t2, 32(a5) +; RV64I-NEXT: lw t3, 36(a5) +; RV64I-NEXT: lw t4, 40(a5) +; RV64I-NEXT: lw t5, 44(a5) +; RV64I-NEXT: lw t6, 48(a5) +; RV64I-NEXT: lw s0, 52(a5) +; RV64I-NEXT: lw s1, 56(a5) +; RV64I-NEXT: lw s2, 60(a5) +; RV64I-NEXT: lw s3, 64(a5) +; RV64I-NEXT: lw s4, 68(a5) +; RV64I-NEXT: sw s4, 68(a5) +; RV64I-NEXT: sw s3, 64(a5) +; RV64I-NEXT: sw s2, 60(a5) +; RV64I-NEXT: sw s1, 56(a5) +; RV64I-NEXT: sw s0, 52(a5) +; RV64I-NEXT: sw t6, 48(a5) +; RV64I-NEXT: sw t5, 44(a5) +; RV64I-NEXT: sw t4, 40(a5) +; RV64I-NEXT: sw t3, 36(a5) +; RV64I-NEXT: sw t2, 32(a5) ; RV64I-NEXT: sw t1, 28(a5) ; RV64I-NEXT: sw t0, 24(a5) ; RV64I-NEXT: sw a7, 20(a5) @@ -2323,16 +2323,16 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32IZCMP-NEXT: .cfi_offset t4, -104 ; RV32IZCMP-NEXT: .cfi_offset t5, -108 ; RV32IZCMP-NEXT: .cfi_offset t6, -112 -; RV32IZCMP-NEXT: lui t0, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-NEXT: lui a4, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, t0, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a4, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -2352,28 +2352,28 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32IZCMP-NEXT: lw s11, 72(a5) ; RV32IZCMP-NEXT: lw ra, 76(a5) ; RV32IZCMP-NEXT: lw s1, 80(a5) -; RV32IZCMP-NEXT: lw t3, 84(a5) -; RV32IZCMP-NEXT: lw t2, 88(a5) -; RV32IZCMP-NEXT: lw t1, 92(a5) -; RV32IZCMP-NEXT: lw a7, 112(a5) -; RV32IZCMP-NEXT: lw s0, 116(a5) -; RV32IZCMP-NEXT: lw a3, 120(a5) -; RV32IZCMP-NEXT: lw a0, 124(a5) -; RV32IZCMP-NEXT: lw a6, 96(a5) -; RV32IZCMP-NEXT: lw a4, 100(a5) -; RV32IZCMP-NEXT: lw a2, 104(a5) -; RV32IZCMP-NEXT: lw a1, 108(a5) -; RV32IZCMP-NEXT: sw a0, 124(a5) -; RV32IZCMP-NEXT: sw a3, 120(a5) -; RV32IZCMP-NEXT: sw s0, 116(a5) -; RV32IZCMP-NEXT: sw a7, 112(a5) -; RV32IZCMP-NEXT: sw a1, 108(a5) -; RV32IZCMP-NEXT: sw a2, 104(a5) -; RV32IZCMP-NEXT: sw a4, 100(a5) -; RV32IZCMP-NEXT: sw a6, 96(a5) -; RV32IZCMP-NEXT: sw t1, 92(a5) -; RV32IZCMP-NEXT: sw t2, 88(a5) -; RV32IZCMP-NEXT: sw t3, 84(a5) +; RV32IZCMP-NEXT: lw t2, 84(a5) +; RV32IZCMP-NEXT: lw t1, 88(a5) +; RV32IZCMP-NEXT: lw t0, 92(a5) +; RV32IZCMP-NEXT: lw a7, 96(a5) +; RV32IZCMP-NEXT: lw s0, 100(a5) +; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a3, 108(a5) +; RV32IZCMP-NEXT: lw a2, 112(a5) +; RV32IZCMP-NEXT: lw a1, 116(a5) +; RV32IZCMP-NEXT: lw a0, 120(a5) +; RV32IZCMP-NEXT: lw t3, 124(a5) +; RV32IZCMP-NEXT: sw t3, 124(a5) +; RV32IZCMP-NEXT: sw a0, 120(a5) +; RV32IZCMP-NEXT: sw a1, 116(a5) +; RV32IZCMP-NEXT: sw a2, 112(a5) +; RV32IZCMP-NEXT: sw a3, 108(a5) +; RV32IZCMP-NEXT: sw a6, 
104(a5) +; RV32IZCMP-NEXT: sw s0, 100(a5) +; RV32IZCMP-NEXT: sw a7, 96(a5) +; RV32IZCMP-NEXT: sw t0, 92(a5) +; RV32IZCMP-NEXT: sw t1, 88(a5) +; RV32IZCMP-NEXT: sw t2, 84(a5) ; RV32IZCMP-NEXT: sw s1, 80(a5) ; RV32IZCMP-NEXT: sw ra, 76(a5) ; RV32IZCMP-NEXT: sw s11, 72(a5) @@ -2394,13 +2394,13 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-NEXT: lw t0, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t1, 84(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t2, 80(sp) # 4-byte Folded Reload @@ -2499,16 +2499,16 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64IZCMP-NEXT: .cfi_offset t4, -208 ; RV64IZCMP-NEXT: .cfi_offset t5, -216 ; RV64IZCMP-NEXT: .cfi_offset t6, -224 -; RV64IZCMP-NEXT: lui t0, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-NEXT: lui a4, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, t0, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a4, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -2528,28 +2528,28 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64IZCMP-NEXT: lw s11, 72(a5) ; RV64IZCMP-NEXT: lw ra, 76(a5) ; RV64IZCMP-NEXT: lw s1, 80(a5) -; RV64IZCMP-NEXT: lw t3, 84(a5) -; RV64IZCMP-NEXT: lw t2, 88(a5) -; RV64IZCMP-NEXT: lw t1, 92(a5) -; RV64IZCMP-NEXT: lw a7, 112(a5) -; RV64IZCMP-NEXT: lw s0, 116(a5) -; RV64IZCMP-NEXT: lw a3, 120(a5) -; RV64IZCMP-NEXT: lw a0, 124(a5) -; RV64IZCMP-NEXT: lw a6, 96(a5) -; RV64IZCMP-NEXT: lw a4, 100(a5) -; RV64IZCMP-NEXT: lw a2, 104(a5) -; RV64IZCMP-NEXT: lw a1, 108(a5) -; RV64IZCMP-NEXT: sw a0, 124(a5) -; RV64IZCMP-NEXT: sw a3, 120(a5) -; RV64IZCMP-NEXT: sw s0, 116(a5) -; RV64IZCMP-NEXT: sw a7, 112(a5) -; RV64IZCMP-NEXT: sw a1, 108(a5) -; RV64IZCMP-NEXT: sw a2, 104(a5) -; RV64IZCMP-NEXT: sw a4, 100(a5) -; RV64IZCMP-NEXT: sw a6, 96(a5) -; RV64IZCMP-NEXT: sw t1, 92(a5) -; RV64IZCMP-NEXT: sw t2, 88(a5) -; RV64IZCMP-NEXT: sw t3, 84(a5) +; RV64IZCMP-NEXT: lw t2, 84(a5) +; RV64IZCMP-NEXT: lw t1, 88(a5) +; RV64IZCMP-NEXT: lw t0, 92(a5) +; RV64IZCMP-NEXT: lw a7, 96(a5) +; RV64IZCMP-NEXT: lw s0, 100(a5) +; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a3, 108(a5) +; RV64IZCMP-NEXT: lw a2, 112(a5) +; RV64IZCMP-NEXT: lw a1, 116(a5) +; 
RV64IZCMP-NEXT: lw a0, 120(a5) +; RV64IZCMP-NEXT: lw t3, 124(a5) +; RV64IZCMP-NEXT: sw t3, 124(a5) +; RV64IZCMP-NEXT: sw a0, 120(a5) +; RV64IZCMP-NEXT: sw a1, 116(a5) +; RV64IZCMP-NEXT: sw a2, 112(a5) +; RV64IZCMP-NEXT: sw a3, 108(a5) +; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw s0, 100(a5) +; RV64IZCMP-NEXT: sw a7, 96(a5) +; RV64IZCMP-NEXT: sw t0, 92(a5) +; RV64IZCMP-NEXT: sw t1, 88(a5) +; RV64IZCMP-NEXT: sw t2, 84(a5) ; RV64IZCMP-NEXT: sw s1, 80(a5) ; RV64IZCMP-NEXT: sw ra, 76(a5) ; RV64IZCMP-NEXT: sw s11, 72(a5) @@ -2570,13 +2570,13 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-NEXT: ld t0, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t1, 152(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t2, 144(sp) # 8-byte Folded Reload @@ -2675,16 +2675,16 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32IZCMP-SR-NEXT: .cfi_offset t4, -104 ; RV32IZCMP-SR-NEXT: .cfi_offset t5, -108 ; RV32IZCMP-SR-NEXT: .cfi_offset t6, -112 -; RV32IZCMP-SR-NEXT: lui t0, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-SR-NEXT: lui a4, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, t0, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a4, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2704,28 +2704,28 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw s11, 72(a5) ; RV32IZCMP-SR-NEXT: lw ra, 76(a5) ; RV32IZCMP-SR-NEXT: lw s1, 80(a5) -; RV32IZCMP-SR-NEXT: lw t3, 84(a5) -; RV32IZCMP-SR-NEXT: lw t2, 88(a5) -; RV32IZCMP-SR-NEXT: lw t1, 92(a5) -; RV32IZCMP-SR-NEXT: lw a7, 112(a5) -; RV32IZCMP-SR-NEXT: lw s0, 116(a5) -; RV32IZCMP-SR-NEXT: lw a3, 120(a5) -; RV32IZCMP-SR-NEXT: lw a0, 124(a5) -; RV32IZCMP-SR-NEXT: lw a6, 96(a5) -; RV32IZCMP-SR-NEXT: lw a4, 100(a5) -; RV32IZCMP-SR-NEXT: lw a2, 104(a5) -; RV32IZCMP-SR-NEXT: lw a1, 108(a5) -; RV32IZCMP-SR-NEXT: sw a0, 124(a5) -; RV32IZCMP-SR-NEXT: sw a3, 120(a5) -; RV32IZCMP-SR-NEXT: sw s0, 116(a5) -; RV32IZCMP-SR-NEXT: sw a7, 112(a5) -; RV32IZCMP-SR-NEXT: sw a1, 108(a5) -; RV32IZCMP-SR-NEXT: sw a2, 104(a5) -; RV32IZCMP-SR-NEXT: sw a4, 100(a5) -; RV32IZCMP-SR-NEXT: sw a6, 96(a5) -; 
RV32IZCMP-SR-NEXT: sw t1, 92(a5) -; RV32IZCMP-SR-NEXT: sw t2, 88(a5) -; RV32IZCMP-SR-NEXT: sw t3, 84(a5) +; RV32IZCMP-SR-NEXT: lw t2, 84(a5) +; RV32IZCMP-SR-NEXT: lw t1, 88(a5) +; RV32IZCMP-SR-NEXT: lw t0, 92(a5) +; RV32IZCMP-SR-NEXT: lw a7, 96(a5) +; RV32IZCMP-SR-NEXT: lw s0, 100(a5) +; RV32IZCMP-SR-NEXT: lw a6, 104(a5) +; RV32IZCMP-SR-NEXT: lw a3, 108(a5) +; RV32IZCMP-SR-NEXT: lw a2, 112(a5) +; RV32IZCMP-SR-NEXT: lw a1, 116(a5) +; RV32IZCMP-SR-NEXT: lw a0, 120(a5) +; RV32IZCMP-SR-NEXT: lw t3, 124(a5) +; RV32IZCMP-SR-NEXT: sw t3, 124(a5) +; RV32IZCMP-SR-NEXT: sw a0, 120(a5) +; RV32IZCMP-SR-NEXT: sw a1, 116(a5) +; RV32IZCMP-SR-NEXT: sw a2, 112(a5) +; RV32IZCMP-SR-NEXT: sw a3, 108(a5) +; RV32IZCMP-SR-NEXT: sw a6, 104(a5) +; RV32IZCMP-SR-NEXT: sw s0, 100(a5) +; RV32IZCMP-SR-NEXT: sw a7, 96(a5) +; RV32IZCMP-SR-NEXT: sw t0, 92(a5) +; RV32IZCMP-SR-NEXT: sw t1, 88(a5) +; RV32IZCMP-SR-NEXT: sw t2, 84(a5) ; RV32IZCMP-SR-NEXT: sw s1, 80(a5) ; RV32IZCMP-SR-NEXT: sw ra, 76(a5) ; RV32IZCMP-SR-NEXT: sw s11, 72(a5) @@ -2746,13 +2746,13 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-SR-NEXT: lw t0, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t1, 84(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t2, 80(sp) # 4-byte Folded Reload @@ -2851,16 +2851,16 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64IZCMP-SR-NEXT: .cfi_offset t4, -208 ; RV64IZCMP-SR-NEXT: .cfi_offset t5, -216 ; RV64IZCMP-SR-NEXT: .cfi_offset t6, -224 -; RV64IZCMP-SR-NEXT: lui t0, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-SR-NEXT: lui a4, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, t0, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a4, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2880,28 +2880,28 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64IZCMP-SR-NEXT: lw s11, 72(a5) ; RV64IZCMP-SR-NEXT: lw ra, 76(a5) ; RV64IZCMP-SR-NEXT: lw s1, 80(a5) -; RV64IZCMP-SR-NEXT: lw t3, 84(a5) -; RV64IZCMP-SR-NEXT: lw t2, 88(a5) -; RV64IZCMP-SR-NEXT: lw t1, 92(a5) -; RV64IZCMP-SR-NEXT: lw a7, 112(a5) -; RV64IZCMP-SR-NEXT: lw s0, 
116(a5) -; RV64IZCMP-SR-NEXT: lw a3, 120(a5) -; RV64IZCMP-SR-NEXT: lw a0, 124(a5) -; RV64IZCMP-SR-NEXT: lw a6, 96(a5) -; RV64IZCMP-SR-NEXT: lw a4, 100(a5) -; RV64IZCMP-SR-NEXT: lw a2, 104(a5) -; RV64IZCMP-SR-NEXT: lw a1, 108(a5) -; RV64IZCMP-SR-NEXT: sw a0, 124(a5) -; RV64IZCMP-SR-NEXT: sw a3, 120(a5) -; RV64IZCMP-SR-NEXT: sw s0, 116(a5) -; RV64IZCMP-SR-NEXT: sw a7, 112(a5) -; RV64IZCMP-SR-NEXT: sw a1, 108(a5) -; RV64IZCMP-SR-NEXT: sw a2, 104(a5) -; RV64IZCMP-SR-NEXT: sw a4, 100(a5) -; RV64IZCMP-SR-NEXT: sw a6, 96(a5) -; RV64IZCMP-SR-NEXT: sw t1, 92(a5) -; RV64IZCMP-SR-NEXT: sw t2, 88(a5) -; RV64IZCMP-SR-NEXT: sw t3, 84(a5) +; RV64IZCMP-SR-NEXT: lw t2, 84(a5) +; RV64IZCMP-SR-NEXT: lw t1, 88(a5) +; RV64IZCMP-SR-NEXT: lw t0, 92(a5) +; RV64IZCMP-SR-NEXT: lw a7, 96(a5) +; RV64IZCMP-SR-NEXT: lw s0, 100(a5) +; RV64IZCMP-SR-NEXT: lw a6, 104(a5) +; RV64IZCMP-SR-NEXT: lw a3, 108(a5) +; RV64IZCMP-SR-NEXT: lw a2, 112(a5) +; RV64IZCMP-SR-NEXT: lw a1, 116(a5) +; RV64IZCMP-SR-NEXT: lw a0, 120(a5) +; RV64IZCMP-SR-NEXT: lw t3, 124(a5) +; RV64IZCMP-SR-NEXT: sw t3, 124(a5) +; RV64IZCMP-SR-NEXT: sw a0, 120(a5) +; RV64IZCMP-SR-NEXT: sw a1, 116(a5) +; RV64IZCMP-SR-NEXT: sw a2, 112(a5) +; RV64IZCMP-SR-NEXT: sw a3, 108(a5) +; RV64IZCMP-SR-NEXT: sw a6, 104(a5) +; RV64IZCMP-SR-NEXT: sw s0, 100(a5) +; RV64IZCMP-SR-NEXT: sw a7, 96(a5) +; RV64IZCMP-SR-NEXT: sw t0, 92(a5) +; RV64IZCMP-SR-NEXT: sw t1, 88(a5) +; RV64IZCMP-SR-NEXT: sw t2, 84(a5) ; RV64IZCMP-SR-NEXT: sw s1, 80(a5) ; RV64IZCMP-SR-NEXT: sw ra, 76(a5) ; RV64IZCMP-SR-NEXT: sw s11, 72(a5) @@ -2922,13 +2922,13 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64IZCMP-SR-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-SR-NEXT: ld t0, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t1, 152(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t2, 144(sp) # 8-byte Folded Reload @@ -3038,16 +3038,16 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32I-NEXT: .cfi_offset t4, -104 ; RV32I-NEXT: .cfi_offset t5, -108 ; RV32I-NEXT: .cfi_offset t6, -112 -; RV32I-NEXT: lui a7, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: lui a4, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a4, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; 
RV32I-NEXT: lw a0, 20(a5) @@ -3070,22 +3070,22 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32I-NEXT: lw s8, 84(a5) ; RV32I-NEXT: lw s9, 88(a5) ; RV32I-NEXT: lw s10, 92(a5) -; RV32I-NEXT: lw s11, 112(a5) -; RV32I-NEXT: lw ra, 116(a5) -; RV32I-NEXT: lw a3, 120(a5) -; RV32I-NEXT: lw a0, 124(a5) -; RV32I-NEXT: lw a6, 96(a5) -; RV32I-NEXT: lw a4, 100(a5) -; RV32I-NEXT: lw a2, 104(a5) -; RV32I-NEXT: lw a1, 108(a5) -; RV32I-NEXT: sw a0, 124(a5) -; RV32I-NEXT: sw a3, 120(a5) -; RV32I-NEXT: sw ra, 116(a5) -; RV32I-NEXT: sw s11, 112(a5) -; RV32I-NEXT: sw a1, 108(a5) -; RV32I-NEXT: sw a2, 104(a5) -; RV32I-NEXT: sw a4, 100(a5) -; RV32I-NEXT: sw a6, 96(a5) +; RV32I-NEXT: lw s11, 96(a5) +; RV32I-NEXT: lw ra, 100(a5) +; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a3, 108(a5) +; RV32I-NEXT: lw a2, 112(a5) +; RV32I-NEXT: lw a1, 116(a5) +; RV32I-NEXT: lw a0, 120(a5) +; RV32I-NEXT: lw a7, 124(a5) +; RV32I-NEXT: sw a7, 124(a5) +; RV32I-NEXT: sw a0, 120(a5) +; RV32I-NEXT: sw a1, 116(a5) +; RV32I-NEXT: sw a2, 112(a5) +; RV32I-NEXT: sw a3, 108(a5) +; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw ra, 100(a5) +; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) ; RV32I-NEXT: sw s9, 88(a5) ; RV32I-NEXT: sw s8, 84(a5) @@ -3109,13 +3109,13 @@ define void @callee_with_irq() "interrupt"="user" { ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV32I-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t1, 132(sp) # 4-byte Folded Reload @@ -3236,16 +3236,16 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64I-NEXT: .cfi_offset t4, -208 ; RV64I-NEXT: .cfi_offset t5, -216 ; RV64I-NEXT: .cfi_offset t6, -224 -; RV64I-NEXT: lui a7, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: lui a4, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a4, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -3268,22 +3268,22 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64I-NEXT: lw s8, 84(a5) ; RV64I-NEXT: lw s9, 88(a5) ; RV64I-NEXT: lw s10, 92(a5) -; RV64I-NEXT: lw s11, 112(a5) -; RV64I-NEXT: lw ra, 116(a5) -; RV64I-NEXT: lw a3, 120(a5) -; RV64I-NEXT: lw a0, 124(a5) -; RV64I-NEXT: lw a6, 96(a5) -; RV64I-NEXT: lw a4, 100(a5) -; RV64I-NEXT: lw a2, 104(a5) -; RV64I-NEXT: lw a1, 108(a5) -; RV64I-NEXT: 
sw a0, 124(a5) -; RV64I-NEXT: sw a3, 120(a5) -; RV64I-NEXT: sw ra, 116(a5) -; RV64I-NEXT: sw s11, 112(a5) -; RV64I-NEXT: sw a1, 108(a5) -; RV64I-NEXT: sw a2, 104(a5) -; RV64I-NEXT: sw a4, 100(a5) -; RV64I-NEXT: sw a6, 96(a5) +; RV64I-NEXT: lw s11, 96(a5) +; RV64I-NEXT: lw ra, 100(a5) +; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a3, 108(a5) +; RV64I-NEXT: lw a2, 112(a5) +; RV64I-NEXT: lw a1, 116(a5) +; RV64I-NEXT: lw a0, 120(a5) +; RV64I-NEXT: lw a7, 124(a5) +; RV64I-NEXT: sw a7, 124(a5) +; RV64I-NEXT: sw a0, 120(a5) +; RV64I-NEXT: sw a1, 116(a5) +; RV64I-NEXT: sw a2, 112(a5) +; RV64I-NEXT: sw a3, 108(a5) +; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw ra, 100(a5) +; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) ; RV64I-NEXT: sw s9, 88(a5) ; RV64I-NEXT: sw s8, 84(a5) @@ -3307,13 +3307,13 @@ define void @callee_with_irq() "interrupt"="user" { ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: ld ra, 264(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t0, 256(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t1, 248(sp) # 8-byte Folded Reload @@ -3396,16 +3396,16 @@ define void @callee_no_irq() { ; RV32IZCMP-NEXT: .cfi_offset s9, -12 ; RV32IZCMP-NEXT: .cfi_offset s10, -8 ; RV32IZCMP-NEXT: .cfi_offset s11, -4 -; RV32IZCMP-NEXT: lui t0, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-NEXT: lui a4, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, t0, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a4, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -3425,28 +3425,28 @@ define void @callee_no_irq() { ; RV32IZCMP-NEXT: lw s11, 72(a5) ; RV32IZCMP-NEXT: lw ra, 76(a5) ; RV32IZCMP-NEXT: lw s1, 80(a5) -; RV32IZCMP-NEXT: lw t3, 84(a5) -; RV32IZCMP-NEXT: lw t2, 88(a5) -; RV32IZCMP-NEXT: lw t1, 92(a5) -; RV32IZCMP-NEXT: lw a7, 112(a5) -; RV32IZCMP-NEXT: lw s0, 116(a5) -; RV32IZCMP-NEXT: lw a3, 120(a5) -; RV32IZCMP-NEXT: lw a0, 124(a5) -; RV32IZCMP-NEXT: lw a6, 96(a5) -; RV32IZCMP-NEXT: lw a4, 100(a5) -; RV32IZCMP-NEXT: lw a2, 104(a5) -; RV32IZCMP-NEXT: lw a1, 108(a5) -; RV32IZCMP-NEXT: sw a0, 124(a5) -; RV32IZCMP-NEXT: sw a3, 120(a5) -; RV32IZCMP-NEXT: sw s0, 116(a5) -; RV32IZCMP-NEXT: sw a7, 112(a5) -; RV32IZCMP-NEXT: sw a1, 108(a5) -; RV32IZCMP-NEXT: sw a2, 104(a5) -; RV32IZCMP-NEXT: sw a4, 100(a5) -; RV32IZCMP-NEXT: sw a6, 96(a5) 
-; RV32IZCMP-NEXT: sw t1, 92(a5) -; RV32IZCMP-NEXT: sw t2, 88(a5) -; RV32IZCMP-NEXT: sw t3, 84(a5) +; RV32IZCMP-NEXT: lw t2, 84(a5) +; RV32IZCMP-NEXT: lw t1, 88(a5) +; RV32IZCMP-NEXT: lw t0, 92(a5) +; RV32IZCMP-NEXT: lw a7, 96(a5) +; RV32IZCMP-NEXT: lw s0, 100(a5) +; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a3, 108(a5) +; RV32IZCMP-NEXT: lw a2, 112(a5) +; RV32IZCMP-NEXT: lw a1, 116(a5) +; RV32IZCMP-NEXT: lw a0, 120(a5) +; RV32IZCMP-NEXT: lw t3, 124(a5) +; RV32IZCMP-NEXT: sw t3, 124(a5) +; RV32IZCMP-NEXT: sw a0, 120(a5) +; RV32IZCMP-NEXT: sw a1, 116(a5) +; RV32IZCMP-NEXT: sw a2, 112(a5) +; RV32IZCMP-NEXT: sw a3, 108(a5) +; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw s0, 100(a5) +; RV32IZCMP-NEXT: sw a7, 96(a5) +; RV32IZCMP-NEXT: sw t0, 92(a5) +; RV32IZCMP-NEXT: sw t1, 88(a5) +; RV32IZCMP-NEXT: sw t2, 84(a5) ; RV32IZCMP-NEXT: sw s1, 80(a5) ; RV32IZCMP-NEXT: sw ra, 76(a5) ; RV32IZCMP-NEXT: sw s11, 72(a5) @@ -3467,13 +3467,13 @@ define void @callee_no_irq() { ; RV32IZCMP-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 80 ; ; RV64IZCMP-LABEL: callee_no_irq: @@ -3493,16 +3493,16 @@ define void @callee_no_irq() { ; RV64IZCMP-NEXT: .cfi_offset s9, -24 ; RV64IZCMP-NEXT: .cfi_offset s10, -16 ; RV64IZCMP-NEXT: .cfi_offset s11, -8 -; RV64IZCMP-NEXT: lui t0, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-NEXT: lui a4, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, t0, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a4, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -3522,28 +3522,28 @@ define void @callee_no_irq() { ; RV64IZCMP-NEXT: lw s11, 72(a5) ; RV64IZCMP-NEXT: lw ra, 76(a5) ; RV64IZCMP-NEXT: lw s1, 80(a5) -; RV64IZCMP-NEXT: lw t3, 84(a5) -; RV64IZCMP-NEXT: lw t2, 88(a5) -; RV64IZCMP-NEXT: lw t1, 92(a5) -; RV64IZCMP-NEXT: lw a7, 112(a5) -; RV64IZCMP-NEXT: lw s0, 116(a5) -; RV64IZCMP-NEXT: lw a3, 120(a5) -; RV64IZCMP-NEXT: lw a0, 124(a5) -; RV64IZCMP-NEXT: lw a6, 96(a5) -; RV64IZCMP-NEXT: lw a4, 100(a5) -; RV64IZCMP-NEXT: lw a2, 104(a5) -; RV64IZCMP-NEXT: lw a1, 108(a5) -; RV64IZCMP-NEXT: sw a0, 124(a5) -; RV64IZCMP-NEXT: sw a3, 120(a5) -; RV64IZCMP-NEXT: sw s0, 116(a5) -; RV64IZCMP-NEXT: sw a7, 112(a5) -; RV64IZCMP-NEXT: sw a1, 
108(a5) -; RV64IZCMP-NEXT: sw a2, 104(a5) -; RV64IZCMP-NEXT: sw a4, 100(a5) -; RV64IZCMP-NEXT: sw a6, 96(a5) -; RV64IZCMP-NEXT: sw t1, 92(a5) -; RV64IZCMP-NEXT: sw t2, 88(a5) -; RV64IZCMP-NEXT: sw t3, 84(a5) +; RV64IZCMP-NEXT: lw t2, 84(a5) +; RV64IZCMP-NEXT: lw t1, 88(a5) +; RV64IZCMP-NEXT: lw t0, 92(a5) +; RV64IZCMP-NEXT: lw a7, 96(a5) +; RV64IZCMP-NEXT: lw s0, 100(a5) +; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a3, 108(a5) +; RV64IZCMP-NEXT: lw a2, 112(a5) +; RV64IZCMP-NEXT: lw a1, 116(a5) +; RV64IZCMP-NEXT: lw a0, 120(a5) +; RV64IZCMP-NEXT: lw t3, 124(a5) +; RV64IZCMP-NEXT: sw t3, 124(a5) +; RV64IZCMP-NEXT: sw a0, 120(a5) +; RV64IZCMP-NEXT: sw a1, 116(a5) +; RV64IZCMP-NEXT: sw a2, 112(a5) +; RV64IZCMP-NEXT: sw a3, 108(a5) +; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw s0, 100(a5) +; RV64IZCMP-NEXT: sw a7, 96(a5) +; RV64IZCMP-NEXT: sw t0, 92(a5) +; RV64IZCMP-NEXT: sw t1, 88(a5) +; RV64IZCMP-NEXT: sw t2, 84(a5) ; RV64IZCMP-NEXT: sw s1, 80(a5) ; RV64IZCMP-NEXT: sw ra, 76(a5) ; RV64IZCMP-NEXT: sw s11, 72(a5) @@ -3564,13 +3564,13 @@ define void @callee_no_irq() { ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32IZCMP-SR-LABEL: callee_no_irq: @@ -3590,16 +3590,16 @@ define void @callee_no_irq() { ; RV32IZCMP-SR-NEXT: .cfi_offset s9, -12 ; RV32IZCMP-SR-NEXT: .cfi_offset s10, -8 ; RV32IZCMP-SR-NEXT: .cfi_offset s11, -4 -; RV32IZCMP-SR-NEXT: lui t0, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-SR-NEXT: lui a4, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, t0, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a4, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -3619,28 +3619,28 @@ define void @callee_no_irq() { ; RV32IZCMP-SR-NEXT: lw s11, 72(a5) ; RV32IZCMP-SR-NEXT: lw ra, 76(a5) ; RV32IZCMP-SR-NEXT: lw s1, 80(a5) -; RV32IZCMP-SR-NEXT: lw t3, 84(a5) -; RV32IZCMP-SR-NEXT: lw t2, 88(a5) -; RV32IZCMP-SR-NEXT: lw t1, 92(a5) -; RV32IZCMP-SR-NEXT: lw a7, 112(a5) -; RV32IZCMP-SR-NEXT: lw s0, 116(a5) -; RV32IZCMP-SR-NEXT: lw a3, 120(a5) -; RV32IZCMP-SR-NEXT: lw a0, 124(a5) -; RV32IZCMP-SR-NEXT: lw a6, 96(a5) -; RV32IZCMP-SR-NEXT: lw a4, 100(a5) -; 
RV32IZCMP-SR-NEXT: lw a2, 104(a5) -; RV32IZCMP-SR-NEXT: lw a1, 108(a5) -; RV32IZCMP-SR-NEXT: sw a0, 124(a5) -; RV32IZCMP-SR-NEXT: sw a3, 120(a5) -; RV32IZCMP-SR-NEXT: sw s0, 116(a5) -; RV32IZCMP-SR-NEXT: sw a7, 112(a5) -; RV32IZCMP-SR-NEXT: sw a1, 108(a5) -; RV32IZCMP-SR-NEXT: sw a2, 104(a5) -; RV32IZCMP-SR-NEXT: sw a4, 100(a5) -; RV32IZCMP-SR-NEXT: sw a6, 96(a5) -; RV32IZCMP-SR-NEXT: sw t1, 92(a5) -; RV32IZCMP-SR-NEXT: sw t2, 88(a5) -; RV32IZCMP-SR-NEXT: sw t3, 84(a5) +; RV32IZCMP-SR-NEXT: lw t2, 84(a5) +; RV32IZCMP-SR-NEXT: lw t1, 88(a5) +; RV32IZCMP-SR-NEXT: lw t0, 92(a5) +; RV32IZCMP-SR-NEXT: lw a7, 96(a5) +; RV32IZCMP-SR-NEXT: lw s0, 100(a5) +; RV32IZCMP-SR-NEXT: lw a6, 104(a5) +; RV32IZCMP-SR-NEXT: lw a3, 108(a5) +; RV32IZCMP-SR-NEXT: lw a2, 112(a5) +; RV32IZCMP-SR-NEXT: lw a1, 116(a5) +; RV32IZCMP-SR-NEXT: lw a0, 120(a5) +; RV32IZCMP-SR-NEXT: lw t3, 124(a5) +; RV32IZCMP-SR-NEXT: sw t3, 124(a5) +; RV32IZCMP-SR-NEXT: sw a0, 120(a5) +; RV32IZCMP-SR-NEXT: sw a1, 116(a5) +; RV32IZCMP-SR-NEXT: sw a2, 112(a5) +; RV32IZCMP-SR-NEXT: sw a3, 108(a5) +; RV32IZCMP-SR-NEXT: sw a6, 104(a5) +; RV32IZCMP-SR-NEXT: sw s0, 100(a5) +; RV32IZCMP-SR-NEXT: sw a7, 96(a5) +; RV32IZCMP-SR-NEXT: sw t0, 92(a5) +; RV32IZCMP-SR-NEXT: sw t1, 88(a5) +; RV32IZCMP-SR-NEXT: sw t2, 84(a5) ; RV32IZCMP-SR-NEXT: sw s1, 80(a5) ; RV32IZCMP-SR-NEXT: sw ra, 76(a5) ; RV32IZCMP-SR-NEXT: sw s11, 72(a5) @@ -3661,13 +3661,13 @@ define void @callee_no_irq() { ; RV32IZCMP-SR-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 80 ; ; RV64IZCMP-SR-LABEL: callee_no_irq: @@ -3687,16 +3687,16 @@ define void @callee_no_irq() { ; RV64IZCMP-SR-NEXT: .cfi_offset s9, -24 ; RV64IZCMP-SR-NEXT: .cfi_offset s10, -16 ; RV64IZCMP-SR-NEXT: .cfi_offset s11, -8 -; RV64IZCMP-SR-NEXT: lui t0, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-SR-NEXT: lui a4, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, t0, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a4, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -3716,28 +3716,28 @@ define void @callee_no_irq() { ; RV64IZCMP-SR-NEXT: lw s11, 72(a5) ; RV64IZCMP-SR-NEXT: lw ra, 
76(a5) ; RV64IZCMP-SR-NEXT: lw s1, 80(a5) -; RV64IZCMP-SR-NEXT: lw t3, 84(a5) -; RV64IZCMP-SR-NEXT: lw t2, 88(a5) -; RV64IZCMP-SR-NEXT: lw t1, 92(a5) -; RV64IZCMP-SR-NEXT: lw a7, 112(a5) -; RV64IZCMP-SR-NEXT: lw s0, 116(a5) -; RV64IZCMP-SR-NEXT: lw a3, 120(a5) -; RV64IZCMP-SR-NEXT: lw a0, 124(a5) -; RV64IZCMP-SR-NEXT: lw a6, 96(a5) -; RV64IZCMP-SR-NEXT: lw a4, 100(a5) -; RV64IZCMP-SR-NEXT: lw a2, 104(a5) -; RV64IZCMP-SR-NEXT: lw a1, 108(a5) -; RV64IZCMP-SR-NEXT: sw a0, 124(a5) -; RV64IZCMP-SR-NEXT: sw a3, 120(a5) -; RV64IZCMP-SR-NEXT: sw s0, 116(a5) -; RV64IZCMP-SR-NEXT: sw a7, 112(a5) -; RV64IZCMP-SR-NEXT: sw a1, 108(a5) -; RV64IZCMP-SR-NEXT: sw a2, 104(a5) -; RV64IZCMP-SR-NEXT: sw a4, 100(a5) -; RV64IZCMP-SR-NEXT: sw a6, 96(a5) -; RV64IZCMP-SR-NEXT: sw t1, 92(a5) -; RV64IZCMP-SR-NEXT: sw t2, 88(a5) -; RV64IZCMP-SR-NEXT: sw t3, 84(a5) +; RV64IZCMP-SR-NEXT: lw t2, 84(a5) +; RV64IZCMP-SR-NEXT: lw t1, 88(a5) +; RV64IZCMP-SR-NEXT: lw t0, 92(a5) +; RV64IZCMP-SR-NEXT: lw a7, 96(a5) +; RV64IZCMP-SR-NEXT: lw s0, 100(a5) +; RV64IZCMP-SR-NEXT: lw a6, 104(a5) +; RV64IZCMP-SR-NEXT: lw a3, 108(a5) +; RV64IZCMP-SR-NEXT: lw a2, 112(a5) +; RV64IZCMP-SR-NEXT: lw a1, 116(a5) +; RV64IZCMP-SR-NEXT: lw a0, 120(a5) +; RV64IZCMP-SR-NEXT: lw t3, 124(a5) +; RV64IZCMP-SR-NEXT: sw t3, 124(a5) +; RV64IZCMP-SR-NEXT: sw a0, 120(a5) +; RV64IZCMP-SR-NEXT: sw a1, 116(a5) +; RV64IZCMP-SR-NEXT: sw a2, 112(a5) +; RV64IZCMP-SR-NEXT: sw a3, 108(a5) +; RV64IZCMP-SR-NEXT: sw a6, 104(a5) +; RV64IZCMP-SR-NEXT: sw s0, 100(a5) +; RV64IZCMP-SR-NEXT: sw a7, 96(a5) +; RV64IZCMP-SR-NEXT: sw t0, 92(a5) +; RV64IZCMP-SR-NEXT: sw t1, 88(a5) +; RV64IZCMP-SR-NEXT: sw t2, 84(a5) ; RV64IZCMP-SR-NEXT: sw s1, 80(a5) ; RV64IZCMP-SR-NEXT: sw ra, 76(a5) ; RV64IZCMP-SR-NEXT: sw s11, 72(a5) @@ -3758,13 +3758,13 @@ define void @callee_no_irq() { ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV64IZCMP-SR-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(t0) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64IZCMP-SR-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32I-LABEL: callee_no_irq: @@ -3797,16 +3797,16 @@ define void @callee_no_irq() { ; RV32I-NEXT: .cfi_offset s9, -44 ; RV32I-NEXT: .cfi_offset s10, -48 ; RV32I-NEXT: .cfi_offset s11, -52 -; RV32I-NEXT: lui a7, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: lui a4, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a4, %lo(var_test_irq) ; 
RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -3829,22 +3829,22 @@ define void @callee_no_irq() { ; RV32I-NEXT: lw s8, 84(a5) ; RV32I-NEXT: lw s9, 88(a5) ; RV32I-NEXT: lw s10, 92(a5) -; RV32I-NEXT: lw s11, 112(a5) -; RV32I-NEXT: lw ra, 116(a5) -; RV32I-NEXT: lw a3, 120(a5) -; RV32I-NEXT: lw a0, 124(a5) -; RV32I-NEXT: lw a6, 96(a5) -; RV32I-NEXT: lw a4, 100(a5) -; RV32I-NEXT: lw a2, 104(a5) -; RV32I-NEXT: lw a1, 108(a5) -; RV32I-NEXT: sw a0, 124(a5) -; RV32I-NEXT: sw a3, 120(a5) -; RV32I-NEXT: sw ra, 116(a5) -; RV32I-NEXT: sw s11, 112(a5) -; RV32I-NEXT: sw a1, 108(a5) -; RV32I-NEXT: sw a2, 104(a5) -; RV32I-NEXT: sw a4, 100(a5) -; RV32I-NEXT: sw a6, 96(a5) +; RV32I-NEXT: lw s11, 96(a5) +; RV32I-NEXT: lw ra, 100(a5) +; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a3, 108(a5) +; RV32I-NEXT: lw a2, 112(a5) +; RV32I-NEXT: lw a1, 116(a5) +; RV32I-NEXT: lw a0, 120(a5) +; RV32I-NEXT: lw a7, 124(a5) +; RV32I-NEXT: sw a7, 124(a5) +; RV32I-NEXT: sw a0, 120(a5) +; RV32I-NEXT: sw a1, 116(a5) +; RV32I-NEXT: sw a2, 112(a5) +; RV32I-NEXT: sw a3, 108(a5) +; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw ra, 100(a5) +; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) ; RV32I-NEXT: sw s9, 88(a5) ; RV32I-NEXT: sw s8, 84(a5) @@ -3868,13 +3868,13 @@ define void @callee_no_irq() { ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -3935,16 +3935,16 @@ define void @callee_no_irq() { ; RV64I-NEXT: .cfi_offset s9, -88 ; RV64I-NEXT: .cfi_offset s10, -96 ; RV64I-NEXT: .cfi_offset s11, -104 -; RV64I-NEXT: lui a7, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: lui a4, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a4) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a4) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a4) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a4, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -3967,22 +3967,22 @@ define void @callee_no_irq() { ; RV64I-NEXT: lw s8, 84(a5) ; RV64I-NEXT: lw s9, 88(a5) ; RV64I-NEXT: lw s10, 92(a5) -; RV64I-NEXT: lw s11, 112(a5) -; RV64I-NEXT: lw ra, 116(a5) -; RV64I-NEXT: lw a3, 120(a5) -; RV64I-NEXT: lw a0, 124(a5) -; RV64I-NEXT: lw a6, 96(a5) -; RV64I-NEXT: lw a4, 100(a5) -; RV64I-NEXT: lw a2, 104(a5) -; RV64I-NEXT: lw a1, 108(a5) -; RV64I-NEXT: sw a0, 
124(a5) -; RV64I-NEXT: sw a3, 120(a5) -; RV64I-NEXT: sw ra, 116(a5) -; RV64I-NEXT: sw s11, 112(a5) -; RV64I-NEXT: sw a1, 108(a5) -; RV64I-NEXT: sw a2, 104(a5) -; RV64I-NEXT: sw a4, 100(a5) -; RV64I-NEXT: sw a6, 96(a5) +; RV64I-NEXT: lw s11, 96(a5) +; RV64I-NEXT: lw ra, 100(a5) +; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a3, 108(a5) +; RV64I-NEXT: lw a2, 112(a5) +; RV64I-NEXT: lw a1, 116(a5) +; RV64I-NEXT: lw a0, 120(a5) +; RV64I-NEXT: lw a7, 124(a5) +; RV64I-NEXT: sw a7, 124(a5) +; RV64I-NEXT: sw a0, 120(a5) +; RV64I-NEXT: sw a1, 116(a5) +; RV64I-NEXT: sw a2, 112(a5) +; RV64I-NEXT: sw a3, 108(a5) +; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw ra, 100(a5) +; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) ; RV64I-NEXT: sw s9, 88(a5) ; RV64I-NEXT: sw s8, 84(a5) @@ -4006,13 +4006,13 @@ define void @callee_no_irq() { ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a4) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a4) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a4) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a4) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll index 32261ee47164e..c53e6dc3b8089 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll @@ -50,8 +50,8 @@ define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: blez a1, .LBB1_7 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: li a3, 1 ; CHECK-NEXT: andi a2, a1, 1 +; CHECK-NEXT: li a3, 1 ; CHECK-NEXT: bne a1, a3, .LBB1_3 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: li a3, 0 diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll index 634cca5dcdb71..5522e3c9a0fb9 100644 --- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll @@ -119,8 +119,8 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; RV32I-LABEL: rotl_64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: sll a4, a0, a2 +; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: bltz a5, .LBB2_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a4 @@ -167,8 +167,8 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV32ZBB-LABEL: rotl_64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: sll a4, a0, a2 +; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: bltz a5, .LBB2_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a3, a4 @@ -212,8 +212,8 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV32XTHEADBB-LABEL: rotl_64: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: sll a4, a0, a2 +; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: bltz a5, .LBB2_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a3, a4 @@ -267,8 +267,8 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { define i64 @rotr_64(i64 %x, i64 %y) 
nounwind { ; RV32I-LABEL: rotr_64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: bltz a5, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a4 @@ -315,8 +315,8 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV32ZBB-LABEL: rotr_64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: srl a4, a1, a2 +; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: bltz a5, .LBB3_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a3, a4 @@ -360,8 +360,8 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV32XTHEADBB-LABEL: rotr_64: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: srl a4, a1, a2 +; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: bltz a5, .LBB3_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a3, a4 @@ -707,8 +707,8 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind { define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; RV32I-LABEL: rotl_64_mask: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: sll a4, a0, a2 +; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: bltz a5, .LBB10_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a4 @@ -720,24 +720,24 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; RV32I-NEXT: srl a6, a7, a6 ; RV32I-NEXT: or a3, a3, a6 ; RV32I-NEXT: .LBB10_3: -; RV32I-NEXT: srai t0, a5, 31 +; RV32I-NEXT: srai a6, a5, 31 ; RV32I-NEXT: neg a5, a2 -; RV32I-NEXT: andi a7, a5, 63 -; RV32I-NEXT: addi a6, a7, -32 -; RV32I-NEXT: and a2, t0, a4 -; RV32I-NEXT: bltz a6, .LBB10_5 +; RV32I-NEXT: and a2, a6, a4 +; RV32I-NEXT: andi a6, a5, 63 +; RV32I-NEXT: addi a4, a6, -32 +; RV32I-NEXT: bltz a4, .LBB10_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: srl a0, a1, a7 +; RV32I-NEXT: srl a0, a1, a6 ; RV32I-NEXT: j .LBB10_6 ; RV32I-NEXT: .LBB10_5: ; RV32I-NEXT: srl a0, a0, a5 -; RV32I-NEXT: not a4, a7 +; RV32I-NEXT: not a6, a6 ; RV32I-NEXT: slli a7, a1, 1 -; RV32I-NEXT: sll a4, a7, a4 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: sll a6, a7, a6 +; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: .LBB10_6: ; RV32I-NEXT: srl a1, a1, a5 -; RV32I-NEXT: srai a4, a6, 31 +; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a1, a4, a1 ; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: or a0, a2, a0 @@ -753,8 +753,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV32ZBB-LABEL: rotl_64_mask: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: sll a4, a0, a2 +; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: bltz a5, .LBB10_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a3, a4 @@ -766,24 +766,24 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; RV32ZBB-NEXT: srl a6, a7, a6 ; RV32ZBB-NEXT: or a3, a3, a6 ; RV32ZBB-NEXT: .LBB10_3: -; RV32ZBB-NEXT: srai t0, a5, 31 +; RV32ZBB-NEXT: srai a6, a5, 31 ; RV32ZBB-NEXT: neg a5, a2 -; RV32ZBB-NEXT: andi a7, a5, 63 -; RV32ZBB-NEXT: addi a6, a7, -32 -; RV32ZBB-NEXT: and a2, t0, a4 -; RV32ZBB-NEXT: bltz a6, .LBB10_5 +; RV32ZBB-NEXT: and a2, a6, a4 +; RV32ZBB-NEXT: andi a6, a5, 63 +; RV32ZBB-NEXT: addi a4, a6, -32 +; RV32ZBB-NEXT: bltz a4, .LBB10_5 ; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: srl a0, a1, a7 +; RV32ZBB-NEXT: srl a0, a1, a6 ; RV32ZBB-NEXT: j .LBB10_6 ; RV32ZBB-NEXT: .LBB10_5: ; RV32ZBB-NEXT: srl a0, a0, a5 -; RV32ZBB-NEXT: not a4, a7 +; RV32ZBB-NEXT: not a6, a6 ; RV32ZBB-NEXT: slli a7, a1, 1 -; RV32ZBB-NEXT: sll a4, a7, a4 -; RV32ZBB-NEXT: or a0, a0, a4 +; RV32ZBB-NEXT: sll a6, a7, a6 +; RV32ZBB-NEXT: or a0, a0, a6 ; RV32ZBB-NEXT: .LBB10_6: ; RV32ZBB-NEXT: srl a1, a1, a5 -; 
RV32ZBB-NEXT: srai a4, a6, 31 +; RV32ZBB-NEXT: srai a4, a4, 31 ; RV32ZBB-NEXT: and a1, a4, a1 ; RV32ZBB-NEXT: or a1, a3, a1 ; RV32ZBB-NEXT: or a0, a2, a0 @@ -796,8 +796,8 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV32XTHEADBB-LABEL: rotl_64_mask: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: sll a4, a0, a2 +; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: bltz a5, .LBB10_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a3, a4 @@ -809,24 +809,24 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; RV32XTHEADBB-NEXT: srl a6, a7, a6 ; RV32XTHEADBB-NEXT: or a3, a3, a6 ; RV32XTHEADBB-NEXT: .LBB10_3: -; RV32XTHEADBB-NEXT: srai t0, a5, 31 +; RV32XTHEADBB-NEXT: srai a6, a5, 31 ; RV32XTHEADBB-NEXT: neg a5, a2 -; RV32XTHEADBB-NEXT: andi a7, a5, 63 -; RV32XTHEADBB-NEXT: addi a6, a7, -32 -; RV32XTHEADBB-NEXT: and a2, t0, a4 -; RV32XTHEADBB-NEXT: bltz a6, .LBB10_5 +; RV32XTHEADBB-NEXT: and a2, a6, a4 +; RV32XTHEADBB-NEXT: andi a6, a5, 63 +; RV32XTHEADBB-NEXT: addi a4, a6, -32 +; RV32XTHEADBB-NEXT: bltz a4, .LBB10_5 ; RV32XTHEADBB-NEXT: # %bb.4: -; RV32XTHEADBB-NEXT: srl a0, a1, a7 +; RV32XTHEADBB-NEXT: srl a0, a1, a6 ; RV32XTHEADBB-NEXT: j .LBB10_6 ; RV32XTHEADBB-NEXT: .LBB10_5: ; RV32XTHEADBB-NEXT: srl a0, a0, a5 -; RV32XTHEADBB-NEXT: not a4, a7 +; RV32XTHEADBB-NEXT: not a6, a6 ; RV32XTHEADBB-NEXT: slli a7, a1, 1 -; RV32XTHEADBB-NEXT: sll a4, a7, a4 -; RV32XTHEADBB-NEXT: or a0, a0, a4 +; RV32XTHEADBB-NEXT: sll a6, a7, a6 +; RV32XTHEADBB-NEXT: or a0, a0, a6 ; RV32XTHEADBB-NEXT: .LBB10_6: ; RV32XTHEADBB-NEXT: srl a1, a1, a5 -; RV32XTHEADBB-NEXT: srai a4, a6, 31 +; RV32XTHEADBB-NEXT: srai a4, a4, 31 ; RV32XTHEADBB-NEXT: and a1, a4, a1 ; RV32XTHEADBB-NEXT: or a1, a3, a1 ; RV32XTHEADBB-NEXT: or a0, a2, a0 @@ -863,12 +863,12 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV32I-NEXT: srl a3, a6, a3 ; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: .LBB11_3: -; RV32I-NEXT: sll a7, a0, a2 -; RV32I-NEXT: srai t0, a4, 31 +; RV32I-NEXT: sll a5, a0, a2 +; RV32I-NEXT: srai a6, a4, 31 ; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: and a2, a6, a5 ; RV32I-NEXT: andi a6, a4, 63 ; RV32I-NEXT: addi a5, a6, -32 -; RV32I-NEXT: and a2, t0, a7 ; RV32I-NEXT: bltz a5, .LBB11_5 ; RV32I-NEXT: # %bb.4: ; RV32I-NEXT: srl a0, a1, a6 @@ -910,12 +910,12 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV32ZBB-NEXT: srl a3, a6, a3 ; RV32ZBB-NEXT: or a3, a5, a3 ; RV32ZBB-NEXT: .LBB11_3: -; RV32ZBB-NEXT: sll a7, a0, a2 -; RV32ZBB-NEXT: srai t0, a4, 31 +; RV32ZBB-NEXT: sll a5, a0, a2 +; RV32ZBB-NEXT: srai a6, a4, 31 ; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: and a2, a6, a5 ; RV32ZBB-NEXT: andi a6, a4, 63 ; RV32ZBB-NEXT: addi a5, a6, -32 -; RV32ZBB-NEXT: and a2, t0, a7 ; RV32ZBB-NEXT: bltz a5, .LBB11_5 ; RV32ZBB-NEXT: # %bb.4: ; RV32ZBB-NEXT: srl a0, a1, a6 @@ -954,12 +954,12 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV32XTHEADBB-NEXT: srl a3, a6, a3 ; RV32XTHEADBB-NEXT: or a3, a5, a3 ; RV32XTHEADBB-NEXT: .LBB11_3: -; RV32XTHEADBB-NEXT: sll a7, a0, a2 -; RV32XTHEADBB-NEXT: srai t0, a4, 31 +; RV32XTHEADBB-NEXT: sll a5, a0, a2 +; RV32XTHEADBB-NEXT: srai a6, a4, 31 ; RV32XTHEADBB-NEXT: neg a4, a2 +; RV32XTHEADBB-NEXT: and a2, a6, a5 ; RV32XTHEADBB-NEXT: andi a6, a4, 63 ; RV32XTHEADBB-NEXT: addi a5, a6, -32 -; RV32XTHEADBB-NEXT: and a2, t0, a7 ; RV32XTHEADBB-NEXT: bltz a5, .LBB11_5 ; RV32XTHEADBB-NEXT: # %bb.4: ; RV32XTHEADBB-NEXT: srl a0, a1, a6 @@ -1042,8 +1042,8 @@ define i64 
@rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind { define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; RV32I-LABEL: rotr_64_mask: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: srl a4, a1, a2 +; RV32I-NEXT: addi a5, a2, -32 ; RV32I-NEXT: bltz a5, .LBB13_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a4 @@ -1055,24 +1055,24 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; RV32I-NEXT: sll a6, a7, a6 ; RV32I-NEXT: or a3, a3, a6 ; RV32I-NEXT: .LBB13_3: -; RV32I-NEXT: srai t0, a5, 31 +; RV32I-NEXT: srai a6, a5, 31 ; RV32I-NEXT: neg a5, a2 -; RV32I-NEXT: andi a7, a5, 63 -; RV32I-NEXT: addi a6, a7, -32 -; RV32I-NEXT: and a2, t0, a4 -; RV32I-NEXT: bltz a6, .LBB13_5 +; RV32I-NEXT: and a2, a6, a4 +; RV32I-NEXT: andi a6, a5, 63 +; RV32I-NEXT: addi a4, a6, -32 +; RV32I-NEXT: bltz a4, .LBB13_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sll a1, a0, a7 +; RV32I-NEXT: sll a1, a0, a6 ; RV32I-NEXT: j .LBB13_6 ; RV32I-NEXT: .LBB13_5: ; RV32I-NEXT: sll a1, a1, a5 -; RV32I-NEXT: not a4, a7 +; RV32I-NEXT: not a6, a6 ; RV32I-NEXT: srli a7, a0, 1 -; RV32I-NEXT: srl a4, a7, a4 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: srl a6, a7, a6 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: .LBB13_6: ; RV32I-NEXT: sll a0, a0, a5 -; RV32I-NEXT: srai a4, a6, 31 +; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a0, a4, a0 ; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: or a1, a2, a1 @@ -1088,8 +1088,8 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV32ZBB-LABEL: rotr_64_mask: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: srl a4, a1, a2 +; RV32ZBB-NEXT: addi a5, a2, -32 ; RV32ZBB-NEXT: bltz a5, .LBB13_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a3, a4 @@ -1101,24 +1101,24 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; RV32ZBB-NEXT: sll a6, a7, a6 ; RV32ZBB-NEXT: or a3, a3, a6 ; RV32ZBB-NEXT: .LBB13_3: -; RV32ZBB-NEXT: srai t0, a5, 31 +; RV32ZBB-NEXT: srai a6, a5, 31 ; RV32ZBB-NEXT: neg a5, a2 -; RV32ZBB-NEXT: andi a7, a5, 63 -; RV32ZBB-NEXT: addi a6, a7, -32 -; RV32ZBB-NEXT: and a2, t0, a4 -; RV32ZBB-NEXT: bltz a6, .LBB13_5 +; RV32ZBB-NEXT: and a2, a6, a4 +; RV32ZBB-NEXT: andi a6, a5, 63 +; RV32ZBB-NEXT: addi a4, a6, -32 +; RV32ZBB-NEXT: bltz a4, .LBB13_5 ; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: sll a1, a0, a7 +; RV32ZBB-NEXT: sll a1, a0, a6 ; RV32ZBB-NEXT: j .LBB13_6 ; RV32ZBB-NEXT: .LBB13_5: ; RV32ZBB-NEXT: sll a1, a1, a5 -; RV32ZBB-NEXT: not a4, a7 +; RV32ZBB-NEXT: not a6, a6 ; RV32ZBB-NEXT: srli a7, a0, 1 -; RV32ZBB-NEXT: srl a4, a7, a4 -; RV32ZBB-NEXT: or a1, a1, a4 +; RV32ZBB-NEXT: srl a6, a7, a6 +; RV32ZBB-NEXT: or a1, a1, a6 ; RV32ZBB-NEXT: .LBB13_6: ; RV32ZBB-NEXT: sll a0, a0, a5 -; RV32ZBB-NEXT: srai a4, a6, 31 +; RV32ZBB-NEXT: srai a4, a4, 31 ; RV32ZBB-NEXT: and a0, a4, a0 ; RV32ZBB-NEXT: or a0, a3, a0 ; RV32ZBB-NEXT: or a1, a2, a1 @@ -1131,8 +1131,8 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV32XTHEADBB-LABEL: rotr_64_mask: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: srl a4, a1, a2 +; RV32XTHEADBB-NEXT: addi a5, a2, -32 ; RV32XTHEADBB-NEXT: bltz a5, .LBB13_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a3, a4 @@ -1144,24 +1144,24 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; RV32XTHEADBB-NEXT: sll a6, a7, a6 ; RV32XTHEADBB-NEXT: or a3, a3, a6 ; RV32XTHEADBB-NEXT: .LBB13_3: -; RV32XTHEADBB-NEXT: srai t0, a5, 31 +; RV32XTHEADBB-NEXT: srai a6, a5, 31 ; RV32XTHEADBB-NEXT: neg a5, a2 -; RV32XTHEADBB-NEXT: andi a7, a5, 63 -; RV32XTHEADBB-NEXT: addi a6, a7, -32 -; RV32XTHEADBB-NEXT: 
and a2, t0, a4 -; RV32XTHEADBB-NEXT: bltz a6, .LBB13_5 +; RV32XTHEADBB-NEXT: and a2, a6, a4 +; RV32XTHEADBB-NEXT: andi a6, a5, 63 +; RV32XTHEADBB-NEXT: addi a4, a6, -32 +; RV32XTHEADBB-NEXT: bltz a4, .LBB13_5 ; RV32XTHEADBB-NEXT: # %bb.4: -; RV32XTHEADBB-NEXT: sll a1, a0, a7 +; RV32XTHEADBB-NEXT: sll a1, a0, a6 ; RV32XTHEADBB-NEXT: j .LBB13_6 ; RV32XTHEADBB-NEXT: .LBB13_5: ; RV32XTHEADBB-NEXT: sll a1, a1, a5 -; RV32XTHEADBB-NEXT: not a4, a7 +; RV32XTHEADBB-NEXT: not a6, a6 ; RV32XTHEADBB-NEXT: srli a7, a0, 1 -; RV32XTHEADBB-NEXT: srl a4, a7, a4 -; RV32XTHEADBB-NEXT: or a1, a1, a4 +; RV32XTHEADBB-NEXT: srl a6, a7, a6 +; RV32XTHEADBB-NEXT: or a1, a1, a6 ; RV32XTHEADBB-NEXT: .LBB13_6: ; RV32XTHEADBB-NEXT: sll a0, a0, a5 -; RV32XTHEADBB-NEXT: srai a4, a6, 31 +; RV32XTHEADBB-NEXT: srai a4, a4, 31 ; RV32XTHEADBB-NEXT: and a0, a4, a0 ; RV32XTHEADBB-NEXT: or a0, a3, a0 ; RV32XTHEADBB-NEXT: or a1, a2, a1 @@ -1198,12 +1198,12 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV32I-NEXT: sll a3, a6, a3 ; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: .LBB14_3: -; RV32I-NEXT: srl a7, a1, a2 -; RV32I-NEXT: srai t0, a4, 31 +; RV32I-NEXT: srl a5, a1, a2 +; RV32I-NEXT: srai a6, a4, 31 ; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: and a2, a6, a5 ; RV32I-NEXT: andi a6, a4, 63 ; RV32I-NEXT: addi a5, a6, -32 -; RV32I-NEXT: and a2, t0, a7 ; RV32I-NEXT: bltz a5, .LBB14_5 ; RV32I-NEXT: # %bb.4: ; RV32I-NEXT: sll a1, a0, a6 @@ -1245,12 +1245,12 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV32ZBB-NEXT: sll a3, a6, a3 ; RV32ZBB-NEXT: or a3, a5, a3 ; RV32ZBB-NEXT: .LBB14_3: -; RV32ZBB-NEXT: srl a7, a1, a2 -; RV32ZBB-NEXT: srai t0, a4, 31 +; RV32ZBB-NEXT: srl a5, a1, a2 +; RV32ZBB-NEXT: srai a6, a4, 31 ; RV32ZBB-NEXT: neg a4, a2 +; RV32ZBB-NEXT: and a2, a6, a5 ; RV32ZBB-NEXT: andi a6, a4, 63 ; RV32ZBB-NEXT: addi a5, a6, -32 -; RV32ZBB-NEXT: and a2, t0, a7 ; RV32ZBB-NEXT: bltz a5, .LBB14_5 ; RV32ZBB-NEXT: # %bb.4: ; RV32ZBB-NEXT: sll a1, a0, a6 @@ -1289,12 +1289,12 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV32XTHEADBB-NEXT: sll a3, a6, a3 ; RV32XTHEADBB-NEXT: or a3, a5, a3 ; RV32XTHEADBB-NEXT: .LBB14_3: -; RV32XTHEADBB-NEXT: srl a7, a1, a2 -; RV32XTHEADBB-NEXT: srai t0, a4, 31 +; RV32XTHEADBB-NEXT: srl a5, a1, a2 +; RV32XTHEADBB-NEXT: srai a6, a4, 31 ; RV32XTHEADBB-NEXT: neg a4, a2 +; RV32XTHEADBB-NEXT: and a2, a6, a5 ; RV32XTHEADBB-NEXT: andi a6, a4, 63 ; RV32XTHEADBB-NEXT: addi a5, a6, -32 -; RV32XTHEADBB-NEXT: and a2, t0, a7 ; RV32XTHEADBB-NEXT: bltz a5, .LBB14_5 ; RV32XTHEADBB-NEXT: # %bb.4: ; RV32XTHEADBB-NEXT: sll a1, a0, a6 @@ -1458,11 +1458,11 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32I-NEXT: not t0, a4 ; RV32I-NEXT: sll t1, a1, a4 ; RV32I-NEXT: srli a1, a6, 1 -; RV32I-NEXT: srl a6, a0, t0 -; RV32I-NEXT: srl t0, a1, t0 +; RV32I-NEXT: srl a0, a0, t0 +; RV32I-NEXT: srl a6, a1, t0 +; RV32I-NEXT: or a1, a7, a0 +; RV32I-NEXT: or a6, t1, a6 ; RV32I-NEXT: addi a0, a5, -32 -; RV32I-NEXT: or a1, a7, a6 -; RV32I-NEXT: or a6, t1, t0 ; RV32I-NEXT: bltz a0, .LBB17_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: sll a3, a2, a5 @@ -1512,11 +1512,11 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32ZBB-NEXT: not t0, a4 ; RV32ZBB-NEXT: sll t1, a1, a4 ; RV32ZBB-NEXT: srli a1, a6, 1 -; RV32ZBB-NEXT: srl a6, a0, t0 -; RV32ZBB-NEXT: srl t0, a1, t0 +; RV32ZBB-NEXT: srl a0, a0, t0 +; RV32ZBB-NEXT: srl a6, a1, t0 +; RV32ZBB-NEXT: or a1, a7, a0 +; RV32ZBB-NEXT: or a6, t1, a6 ; RV32ZBB-NEXT: 
addi a0, a5, -32 -; RV32ZBB-NEXT: or a1, a7, a6 -; RV32ZBB-NEXT: or a6, t1, t0 ; RV32ZBB-NEXT: bltz a0, .LBB17_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: sll a3, a2, a5 @@ -1562,11 +1562,11 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32XTHEADBB-NEXT: not t0, a4 ; RV32XTHEADBB-NEXT: sll t1, a1, a4 ; RV32XTHEADBB-NEXT: srli a1, a6, 1 -; RV32XTHEADBB-NEXT: srl a6, a0, t0 -; RV32XTHEADBB-NEXT: srl t0, a1, t0 +; RV32XTHEADBB-NEXT: srl a0, a0, t0 +; RV32XTHEADBB-NEXT: srl a6, a1, t0 +; RV32XTHEADBB-NEXT: or a1, a7, a0 +; RV32XTHEADBB-NEXT: or a6, t1, a6 ; RV32XTHEADBB-NEXT: addi a0, a5, -32 -; RV32XTHEADBB-NEXT: or a1, a7, a6 -; RV32XTHEADBB-NEXT: or a6, t1, t0 ; RV32XTHEADBB-NEXT: bltz a0, .LBB17_6 ; RV32XTHEADBB-NEXT: # %bb.5: ; RV32XTHEADBB-NEXT: sll a3, a2, a5 @@ -1683,13 +1683,13 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32I-NEXT: .LBB19_4: ; RV32I-NEXT: slli a1, a0, 1 ; RV32I-NEXT: not t0, a4 -; RV32I-NEXT: srl t1, a0, a4 +; RV32I-NEXT: srl a0, a0, a4 ; RV32I-NEXT: slli a6, a6, 1 ; RV32I-NEXT: sll a1, a1, t0 ; RV32I-NEXT: sll a6, a6, t0 -; RV32I-NEXT: addi a0, a5, -32 ; RV32I-NEXT: or a1, a1, a7 -; RV32I-NEXT: or a6, a6, t1 +; RV32I-NEXT: or a6, a6, a0 +; RV32I-NEXT: addi a0, a5, -32 ; RV32I-NEXT: bltz a0, .LBB19_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: sll a3, a2, a5 @@ -1736,13 +1736,13 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32ZBB-NEXT: .LBB19_4: ; RV32ZBB-NEXT: slli a1, a0, 1 ; RV32ZBB-NEXT: not t0, a4 -; RV32ZBB-NEXT: srl t1, a0, a4 +; RV32ZBB-NEXT: srl a0, a0, a4 ; RV32ZBB-NEXT: slli a6, a6, 1 ; RV32ZBB-NEXT: sll a1, a1, t0 ; RV32ZBB-NEXT: sll a6, a6, t0 -; RV32ZBB-NEXT: addi a0, a5, -32 ; RV32ZBB-NEXT: or a1, a1, a7 -; RV32ZBB-NEXT: or a6, a6, t1 +; RV32ZBB-NEXT: or a6, a6, a0 +; RV32ZBB-NEXT: addi a0, a5, -32 ; RV32ZBB-NEXT: bltz a0, .LBB19_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: sll a3, a2, a5 @@ -1786,13 +1786,13 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV32XTHEADBB-NEXT: .LBB19_4: ; RV32XTHEADBB-NEXT: slli a1, a0, 1 ; RV32XTHEADBB-NEXT: not t0, a4 -; RV32XTHEADBB-NEXT: srl t1, a0, a4 +; RV32XTHEADBB-NEXT: srl a0, a0, a4 ; RV32XTHEADBB-NEXT: slli a6, a6, 1 ; RV32XTHEADBB-NEXT: sll a1, a1, t0 ; RV32XTHEADBB-NEXT: sll a6, a6, t0 -; RV32XTHEADBB-NEXT: addi a0, a5, -32 ; RV32XTHEADBB-NEXT: or a1, a1, a7 -; RV32XTHEADBB-NEXT: or a6, a6, t1 +; RV32XTHEADBB-NEXT: or a6, a6, a0 +; RV32XTHEADBB-NEXT: addi a0, a5, -32 ; RV32XTHEADBB-NEXT: bltz a0, .LBB19_6 ; RV32XTHEADBB-NEXT: # %bb.5: ; RV32XTHEADBB-NEXT: sll a3, a2, a5 @@ -2314,8 +2314,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32I-LABEL: rotl_64_zext: ; RV32I: # %bb.0: ; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: addi a6, a2, -32 ; RV32I-NEXT: sll a5, a0, a2 +; RV32I-NEXT: addi a6, a2, -32 ; RV32I-NEXT: bltz a6, .LBB24_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a5 @@ -2362,8 +2362,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32ZBB-LABEL: rotl_64_zext: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: neg a4, a2 -; RV32ZBB-NEXT: addi a6, a2, -32 ; RV32ZBB-NEXT: sll a5, a0, a2 +; RV32ZBB-NEXT: addi a6, a2, -32 ; RV32ZBB-NEXT: bltz a6, .LBB24_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a3, a5 @@ -2407,8 +2407,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV32XTHEADBB-LABEL: rotl_64_zext: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: neg a4, a2 -; RV32XTHEADBB-NEXT: addi a6, a2, -32 ; RV32XTHEADBB-NEXT: sll a5, a0, a2 +; 
RV32XTHEADBB-NEXT: addi a6, a2, -32 ; RV32XTHEADBB-NEXT: bltz a6, .LBB24_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a3, a5 @@ -2464,8 +2464,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32I-LABEL: rotr_64_zext: ; RV32I: # %bb.0: ; RV32I-NEXT: neg a4, a2 -; RV32I-NEXT: addi a6, a2, -32 ; RV32I-NEXT: srl a5, a1, a2 +; RV32I-NEXT: addi a6, a2, -32 ; RV32I-NEXT: bltz a6, .LBB25_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a5 @@ -2512,8 +2512,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32ZBB-LABEL: rotr_64_zext: ; RV32ZBB: # %bb.0: ; RV32ZBB-NEXT: neg a4, a2 -; RV32ZBB-NEXT: addi a6, a2, -32 ; RV32ZBB-NEXT: srl a5, a1, a2 +; RV32ZBB-NEXT: addi a6, a2, -32 ; RV32ZBB-NEXT: bltz a6, .LBB25_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: mv a3, a5 @@ -2557,8 +2557,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV32XTHEADBB-LABEL: rotr_64_zext: ; RV32XTHEADBB: # %bb.0: ; RV32XTHEADBB-NEXT: neg a4, a2 -; RV32XTHEADBB-NEXT: addi a6, a2, -32 ; RV32XTHEADBB-NEXT: srl a5, a1, a2 +; RV32XTHEADBB-NEXT: addi a6, a2, -32 ; RV32XTHEADBB-NEXT: bltz a6, .LBB25_2 ; RV32XTHEADBB-NEXT: # %bb.1: ; RV32XTHEADBB-NEXT: mv a3, a5 diff --git a/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll index f14fe2665835e..3f1b2fab8bb10 100644 --- a/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll +++ b/llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll @@ -42,8 +42,8 @@ define i64 @test_Pr_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: sw a1, 0(sp) ; CHECK-NEXT: sw a3, 4(sp) ; CHECK-NEXT: #APP @@ -112,8 +112,8 @@ define i64 @test_cR_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: sw a1, 0(sp) ; CHECK-NEXT: sw a3, 4(sp) ; CHECK-NEXT: #APP diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 90a8eadb3f974..15cea807a26de 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -762,16 +762,16 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; ; RV32ZBB-LABEL: ctpop_v2i64: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a2, 4(a1) -; RV32ZBB-NEXT: lw a3, 0(a1) -; RV32ZBB-NEXT: lw a4, 12(a1) -; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: lw a2, 0(a1) +; RV32ZBB-NEXT: lw a3, 4(a1) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) ; RV32ZBB-NEXT: cpop a3, a3 -; RV32ZBB-NEXT: cpop a4, a4 +; RV32ZBB-NEXT: cpop a2, a2 ; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: add a2, a3, a2 -; RV32ZBB-NEXT: add a1, a1, a4 +; RV32ZBB-NEXT: cpop a4, a4 +; RV32ZBB-NEXT: add a2, a2, a3 +; RV32ZBB-NEXT: add a1, a4, a1 ; RV32ZBB-NEXT: sw a2, 0(a0) ; RV32ZBB-NEXT: sw zero, 4(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) @@ -806,18 +806,18 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { ; ; RV32ZBB-LABEL: ctpop_v2i64_ult_two: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: lw a1, 0(a0) +; RV32ZBB-NEXT: lw a2, 4(a0) +; RV32ZBB-NEXT: lw a3, 8(a0) +; RV32ZBB-NEXT: lw a0, 12(a0) ; RV32ZBB-NEXT: 
cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: sltiu a1, a1, 2 +; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: add a3, a3, a0 +; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: sltiu a0, a1, 2 +; RV32ZBB-NEXT: sltiu a1, a3, 2 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ult <2 x i64> %1, @@ -849,20 +849,20 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { ; ; RV32ZBB-LABEL: ctpop_v2i64_ugt_one: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: lw a1, 0(a0) +; RV32ZBB-NEXT: lw a2, 4(a0) +; RV32ZBB-NEXT: lw a3, 8(a0) +; RV32ZBB-NEXT: lw a0, 12(a0) ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: add a0, a3, a0 +; RV32ZBB-NEXT: add a1, a1, a2 ; RV32ZBB-NEXT: sltiu a1, a1, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 -; RV32ZBB-NEXT: xori a1, a1, 1 +; RV32ZBB-NEXT: sltiu a2, a0, 2 +; RV32ZBB-NEXT: xori a0, a1, 1 +; RV32ZBB-NEXT: xori a1, a2, 1 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ugt <2 x i64> %1, @@ -904,20 +904,20 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { ; ; RV32ZBB-LABEL: ctpop_v2i64_eq_one: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: lw a1, 0(a0) +; RV32ZBB-NEXT: lw a2, 4(a0) +; RV32ZBB-NEXT: lw a3, 8(a0) +; RV32ZBB-NEXT: lw a0, 12(a0) ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: add a0, a3, a0 +; RV32ZBB-NEXT: add a1, a1, a2 ; RV32ZBB-NEXT: addi a1, a1, -1 -; RV32ZBB-NEXT: seqz a0, a0 -; RV32ZBB-NEXT: seqz a1, a1 +; RV32ZBB-NEXT: addi a2, a0, -1 +; RV32ZBB-NEXT: seqz a0, a1 +; RV32ZBB-NEXT: seqz a1, a2 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp eq <2 x i64> %1, @@ -961,20 +961,20 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { ; ; RV32ZBB-LABEL: ctpop_v2i64_ne_one: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a1, 12(a0) -; RV32ZBB-NEXT: lw a2, 8(a0) -; RV32ZBB-NEXT: lw a3, 4(a0) -; RV32ZBB-NEXT: lw a0, 0(a0) -; RV32ZBB-NEXT: cpop a1, a1 -; RV32ZBB-NEXT: cpop a2, a2 -; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: lw a1, 0(a0) +; RV32ZBB-NEXT: lw a2, 4(a0) +; RV32ZBB-NEXT: lw a3, 8(a0) +; RV32ZBB-NEXT: lw a0, 12(a0) ; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a0, a0, a3 -; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: add a0, a3, a0 +; RV32ZBB-NEXT: add a1, a1, a2 ; RV32ZBB-NEXT: addi a1, a1, -1 -; RV32ZBB-NEXT: snez a0, a0 -; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: addi a2, a0, -1 +; RV32ZBB-NEXT: snez a0, a1 +; RV32ZBB-NEXT: snez a1, a2 ; RV32ZBB-NEXT: ret %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) %2 = icmp ne <2 x i64> %1, 
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll index 1a3beeb79b85b..17ea0a32cf475 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbs.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll @@ -787,8 +787,8 @@ define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, -1 ; CHECK-NEXT: andi a3, a0, 63 -; CHECK-NEXT: addi a1, a3, -32 ; CHECK-NEXT: sll a0, a2, a0 +; CHECK-NEXT: addi a1, a3, -32 ; CHECK-NEXT: bltz a1, .LBB43_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: sll a2, a2, a3 @@ -815,8 +815,8 @@ define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind { ; CHECK-LABEL: bset_trailing_ones_i64_no_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: addi a2, a0, -32 ; CHECK-NEXT: sll a1, a1, a0 +; CHECK-NEXT: addi a2, a0, -32 ; CHECK-NEXT: bltz a2, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll index dd49d9e3e2dce..8865f244cee1e 100644 --- a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll @@ -122,9 +122,9 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64ID-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64ID-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64ID-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill +; RV64ID-NEXT: fmv.d fs0, fa0 ; RV64ID-NEXT: lui a0, %hi(.LCPI4_0) ; RV64ID-NEXT: fld fa5, %lo(.LCPI4_0)(a0) -; RV64ID-NEXT: fmv.d fs0, fa0 ; RV64ID-NEXT: fle.d s0, fa5, fa0 ; RV64ID-NEXT: call __fixdfti ; RV64ID-NEXT: li a2, -1 diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll index ea582ac258b71..a243d9ed68a33 100644 --- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll @@ -309,14 +309,14 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind { ; RV64IZFH-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64IZFH-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64IZFH-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64IZFH-NEXT: fcvt.s.h fa0, fa0 ; RV64IZFH-NEXT: lui a0, %hi(.LCPI5_0) +; RV64IZFH-NEXT: fmv.w.x fa5, zero +; RV64IZFH-NEXT: fle.s a1, fa5, fa0 ; RV64IZFH-NEXT: flw fa5, %lo(.LCPI5_0)(a0) -; RV64IZFH-NEXT: fcvt.s.h fa0, fa0 -; RV64IZFH-NEXT: fmv.w.x fa4, zero -; RV64IZFH-NEXT: fle.s a0, fa4, fa0 -; RV64IZFH-NEXT: flt.s a1, fa5, fa0 -; RV64IZFH-NEXT: neg s0, a1 -; RV64IZFH-NEXT: neg s1, a0 +; RV64IZFH-NEXT: flt.s a0, fa5, fa0 +; RV64IZFH-NEXT: neg s0, a0 +; RV64IZFH-NEXT: neg s1, a1 ; RV64IZFH-NEXT: call __fixunssfti ; RV64IZFH-NEXT: and a0, s1, a0 ; RV64IZFH-NEXT: and a1, s1, a1 diff --git a/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll index ac455b7fac882..c1b8d0865dca8 100644 --- a/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll +++ b/llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll @@ -42,8 +42,8 @@ define i128 @test_R_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi sp, sp, -32 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: sd a1, 0(sp) ; CHECK-NEXT: sd a3, 8(sp) ; CHECK-NEXT: #APP @@ -112,8 +112,8 @@ define i128 @test_cR_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi sp, sp, -32 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: sd a1, 0(sp) 
; CHECK-NEXT: sd a3, 8(sp) ; CHECK-NEXT: #APP diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll index 1ec4d8ddd1d84..8379036b2d74d 100644 --- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll +++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll @@ -29,8 +29,8 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ; RV64-NEXT: sd a3, 16(sp) ; RV64-NEXT: sd a1, 24(sp) ; RV64-NEXT: addi a1, sp, 24 -; RV64-NEXT: addi a0, sp, 8 ; RV64-NEXT: addi s1, sp, 8 +; RV64-NEXT: addi a0, sp, 8 ; RV64-NEXT: call __clear_cache ; RV64-NEXT: mv a0, s0 ; RV64-NEXT: jalr s1 @@ -60,8 +60,8 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ; RV64-LINUX-NEXT: sd a3, 16(sp) ; RV64-LINUX-NEXT: sd a1, 24(sp) ; RV64-LINUX-NEXT: addi a1, sp, 24 -; RV64-LINUX-NEXT: addi a0, sp, 8 ; RV64-LINUX-NEXT: addi s1, sp, 8 +; RV64-LINUX-NEXT: addi a0, sp, 8 ; RV64-LINUX-NEXT: li a2, 0 ; RV64-LINUX-NEXT: call __riscv_flush_icache ; RV64-LINUX-NEXT: mv a0, s0 diff --git a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll index b8c43289bdfed..dd16e2beacec2 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll @@ -169,9 +169,9 @@ define signext i32 @andi_srliw(i32 signext %0, ptr %1, i32 signext %2) { ; CHECK-LABEL: andi_srliw: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a3, a0, -8 -; CHECK-NEXT: srliw a4, a0, 3 +; CHECK-NEXT: srliw a0, a0, 3 +; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: addw a0, a3, a2 -; CHECK-NEXT: sw a4, 0(a1) ; CHECK-NEXT: ret %4 = and i32 %0, -8 %5 = lshr i32 %0, 3 diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll index 985837d05caa2..b87d3504ce9ff 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll @@ -106,8 +106,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) { ; RV64I-LABEL: pack_i64_3: ; RV64I: # %bb.0: ; RV64I-NEXT: lw a0, 0(a0) -; RV64I-NEXT: lwu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: lwu a1, 0(a1) ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 4ade6c09fe43d..fa6ae2f8b171e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -103,13 +103,13 @@ define <8 x i1> @fv8(ptr %p, i64 %index, i64 %tc) { define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv32: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: lui a0, %hi(.LCPI8_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vid.v v16 -; CHECK-NEXT: vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsext.vf8 v16, v8 ; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 @@ -130,11 +130,8 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) -; CHECK-NEXT: vle8.v v17, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI9_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vle8.v v18, (a0) +; CHECK-NEXT: vle8.v v17, (a0) ; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsext.vf8 v8, v16 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 @@ -142,13 +139,16 @@ define <64 x i1> 
@fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsext.vf8 v8, v17 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v17, v8, a2 +; CHECK-NEXT: lui a0, %hi(.LCPI9_2) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v17, 4 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vsext.vf8 v8, v18 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v0, v16, 6 diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll index 9ac2775d30668..3f4a7fca33293 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll @@ -17,17 +17,17 @@ define void @test(ptr %addr) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb ; CHECK-NEXT: csrrs a1, vlenb, zero ; CHECK-NEXT: vl1re64.v v8, (a0) -; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: add a3, a0, a1 ; CHECK-NEXT: vl1re64.v v9, (a3) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: slli a3, a1, 1 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: vl1re64.v v10, (a0) -; CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vs1r.v v8, (a3) -; CHECK-NEXT: vs1r.v v9, (a2) -; CHECK-NEXT: vs1r.v v10, (a1) +; CHECK-NEXT: add a3, a2, a3 +; CHECK-NEXT: vs1r.v v8, (a2) +; CHECK-NEXT: vs1r.v v10, (a3) +; CHECK-NEXT: vs1r.v v9, (a1) ; CHECK-NEXT: csrrs a0, vlenb, zero ; CHECK-NEXT: slli a1, a0, 1 ; CHECK-NEXT: add a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll index fb25d4e15e40e..5fecb75d847a0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll @@ -17,14 +17,14 @@ define @test(ptr %addr, i64 %vl) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: csrrs a2, vlenb, zero ; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: vl1re64.v v9, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vs1r.v v8, (a3) ; CHECK-NEXT: vs1r.v v9, (a2) ; CHECK-NEXT: vl1re64.v v8, (a2) -; CHECK-NEXT: vl1re64.v v9, (a0) +; CHECK-NEXT: vl1re64.v v9, (a3) ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma ; CHECK-NEXT: vfadd.vv v8, v9, v8 ; CHECK-NEXT: csrrs a0, vlenb, zero diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll index 1ed84316d4484..d7c608fffd7a3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll @@ -713,59 +713,59 @@ define @bitreverse_nxv1i64( %va) { ; RV32-NEXT: vsetvli a4, zero, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 24 ; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: 
vsrl.vi v10, v8, 8 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: vsrl.vx v11, v8, a1 +; RV32-NEXT: vsrl.vx v12, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: vand.vx v11, v11, a0 -; RV32-NEXT: vlse64.v v13, (a5), zero -; RV32-NEXT: vor.vv v10, v11, v10 -; RV32-NEXT: vand.vx v11, v8, a0 -; RV32-NEXT: vsll.vx v11, v11, a2 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: vand.vx v9, v9, a4 -; RV32-NEXT: vand.vv v12, v12, v13 -; RV32-NEXT: vor.vv v9, v12, v9 +; RV32-NEXT: vsll.vx v13, v8, a1 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vor.vv v11, v12, v11 +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsll.vx v12, v12, a2 +; RV32-NEXT: vor.vv v12, v13, v12 +; RV32-NEXT: vlse64.v v13, (a5), zero +; RV32-NEXT: vand.vv v10, v10, v13 +; RV32-NEXT: vor.vv v9, v10, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: vand.vv v12, v8, v13 +; RV32-NEXT: vand.vv v10, v8, v13 ; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vor.vv v9, v9, v11 ; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmv.v.x v11, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a2 +; RV32-NEXT: vmv.v.x v12, a2 ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v9, v9, v12 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vand.vv v9, v9, v11 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -852,42 +852,42 @@ define @bitreverse_nxv2i64( %va) { ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: vsetvli a4, zero, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 24 ; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsrl.vi v14, v8, 8 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsll.vx v18, v8, a1 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vlse64.v v14, (a5), zero -; RV32-NEXT: vor.vv v12, v12, v10 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsll.vx v10, v10, a2 -; RV32-NEXT: vor.vv v10, v18, v10 -; RV32-NEXT: vsrl.vi v18, 
v8, 8 -; RV32-NEXT: vand.vx v16, v16, a4 -; RV32-NEXT: vand.vv v18, v18, v14 -; RV32-NEXT: vor.vv v16, v18, v16 +; RV32-NEXT: vand.vx v18, v10, a4 +; RV32-NEXT: vsll.vx v10, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vx v16, v16, a2 +; RV32-NEXT: vor.vv v10, v10, v16 +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vand.vv v14, v14, v16 +; RV32-NEXT: vor.vv v14, v14, v18 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: vand.vv v14, v8, v14 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vor.vv v12, v14, v12 ; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vmv.v.x v14, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; RV32-NEXT: vsll.vi v14, v14, 8 -; RV32-NEXT: vor.vv v8, v8, v14 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v14, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma @@ -895,13 +895,13 @@ define @bitreverse_nxv2i64( %va) { ; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vand.vv v12, v12, v14 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v14 -; RV32-NEXT: vand.vv v12, v12, v14 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 @@ -993,42 +993,42 @@ define @bitreverse_nxv4i64( %va) { ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: vsetvli a4, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: vsrl.vi v12, v8, 24 ; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsrl.vx v12, v8, a1 -; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vsrl.vx v24, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsll.vx v28, v8, a1 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vlse64.v v20, (a5), zero -; RV32-NEXT: vor.vv v16, v16, v12 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsll.vx v12, v12, a2 -; RV32-NEXT: vor.vv v12, v28, v12 -; RV32-NEXT: vsrl.vi v28, v8, 8 -; RV32-NEXT: vand.vx v24, v24, a4 -; RV32-NEXT: vand.vv v28, v28, v20 -; RV32-NEXT: vor.vv v24, v28, v24 +; RV32-NEXT: vand.vx v28, v12, a4 +; RV32-NEXT: vsll.vx v12, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a0 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vand.vx v24, v8, a0 +; RV32-NEXT: vsll.vx v24, v24, a2 +; RV32-NEXT: vor.vv v12, v12, v24 +; RV32-NEXT: vlse64.v v24, (a5), zero +; RV32-NEXT: vand.vv v20, v20, v24 +; RV32-NEXT: vor.vv v20, v20, v28 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: vand.vv v20, v8, v20 +; RV32-NEXT: vand.vv v24, v8, v24 ; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: addi a2, a2, 1365 ; 
RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vor.vv v16, v20, v16 ; RV32-NEXT: vsetvli a3, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v24, a0 +; RV32-NEXT: vmv.v.x v20, a0 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vsll.vi v20, v20, 8 -; RV32-NEXT: vor.vv v8, v8, v20 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v20, a1 +; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma @@ -1036,13 +1036,13 @@ define @bitreverse_nxv4i64( %va) { ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vand.vv v16, v16, v20 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v20 -; RV32-NEXT: vand.vv v16, v16, v20 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 @@ -1137,38 +1137,38 @@ define @bitreverse_nxv8i64( %va) { ; RV32-NEXT: li a1, 56 ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: addi a5, sp, 8 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsll.vx v0, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a0 +; RV32-NEXT: vand.vx v24, v24, a3 ; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a3 ; RV32-NEXT: vsll.vx v16, v16, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a5), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a1 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24 -; RV32-NEXT: vand.vv v16, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a1 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll index 4d34621cd5f24..e2c8bc8b29171 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll @@ -1585,58 +1585,58 @@ define @vp_bitreverse_nxv1i64_unmasked( %va ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: vsll.vx v11, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v11, v8, a2 -; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v13, v8, a4 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vx v13, v13, a1 +; RV32-NEXT: vor.vv v12, v13, v12 ; RV32-NEXT: vand.vx v13, v8, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vlse64.v v12, (a6), zero ; RV32-NEXT: vsll.vx v13, v13, a4 -; RV32-NEXT: vor.vv v10, v10, v13 -; RV32-NEXT: vsrl.vi v13, v8, 8 -; RV32-NEXT: vand.vx v9, v9, a5 -; RV32-NEXT: vand.vv v13, v13, v12 -; RV32-NEXT: vor.vv v9, v13, v9 +; RV32-NEXT: vor.vv v11, v11, v13 +; RV32-NEXT: vlse64.v v13, (a6), zero +; RV32-NEXT: vand.vv v10, v10, v13 +; RV32-NEXT: vor.vv v9, v10, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vv v10, v8, v13 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: addi a3, a3, 1365 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vor.vv v9, v9, v12 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a2 +; RV32-NEXT: vmv.v.x v12, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v11, v8 ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vmv.v.x v11, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v9, v9, v12 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vand.vv v9, v9, v11 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1866,23 +1866,23 @@ define @vp_bitreverse_nxv2i64_unmasked( %va ; RV32-NEXT: lui a5, 4080 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v14, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vsrl.vi v14, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsll.vx v12, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v10, v8, a2 -; RV32-NEXT: vsrl.vx v16, v8, a4 -; RV32-NEXT: vand.vx v18, v8, a1 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vor.vv v10, v16, v10 +; RV32-NEXT: 
vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v18, v8, a4 +; RV32-NEXT: vand.vx v20, v10, a5 +; RV32-NEXT: vand.vx v10, v18, a1 +; RV32-NEXT: vor.vv v10, v10, v16 +; RV32-NEXT: vand.vx v16, v8, a1 +; RV32-NEXT: vsll.vx v16, v16, a4 +; RV32-NEXT: vor.vv v12, v12, v16 ; RV32-NEXT: vlse64.v v16, (a6), zero -; RV32-NEXT: vsll.vx v18, v18, a4 -; RV32-NEXT: vor.vv v12, v12, v18 -; RV32-NEXT: vsrl.vi v18, v8, 8 -; RV32-NEXT: vand.vx v14, v14, a5 -; RV32-NEXT: vand.vv v18, v18, v16 -; RV32-NEXT: vor.vv v14, v18, v14 +; RV32-NEXT: vand.vv v14, v14, v16 +; RV32-NEXT: vor.vv v14, v14, v20 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 @@ -2148,23 +2148,23 @@ define @vp_bitreverse_nxv4i64_unmasked( %va ; RV32-NEXT: lui a5, 4080 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v20, v8, 24 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsll.vx v16, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v12, v8, a2 -; RV32-NEXT: vsrl.vx v24, v8, a4 -; RV32-NEXT: vand.vx v28, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a1 -; RV32-NEXT: vor.vv v12, v24, v12 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vsrl.vx v28, v8, a4 +; RV32-NEXT: vand.vx v4, v12, a5 +; RV32-NEXT: vand.vx v12, v28, a1 +; RV32-NEXT: vor.vv v12, v12, v24 +; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: vsll.vx v24, v24, a4 +; RV32-NEXT: vor.vv v16, v16, v24 ; RV32-NEXT: vlse64.v v24, (a6), zero -; RV32-NEXT: vsll.vx v28, v28, a4 -; RV32-NEXT: vor.vv v16, v16, v28 -; RV32-NEXT: vsrl.vi v28, v8, 8 -; RV32-NEXT: vand.vx v20, v20, a5 -; RV32-NEXT: vand.vv v28, v28, v24 -; RV32-NEXT: vor.vv v20, v28, v20 +; RV32-NEXT: vand.vv v20, v20, v24 +; RV32-NEXT: vor.vv v20, v20, v4 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 @@ -2288,66 +2288,68 @@ define @vp_bitreverse_nxv7i64( %va, @vp_bitreverse_nxv7i64_unmasked( %va ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v16, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v24, v8, a3 ; RV32-NEXT: vsll.vx v24, v24, a4 ; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v24, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v24, a5 +; RV32-NEXT: vsrl.vi v24, 
v8, 8 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v0, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v24, v8, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v0, v8 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 @@ -2673,66 +2675,68 @@ define @vp_bitreverse_nxv8i64( %va, @vp_bitreverse_nxv8i64_unmasked( %va ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v16, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v24, v8, a3 ; RV32-NEXT: vsll.vx v24, v24, a4 ; RV32-NEXT: vor.vv v16, v16, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v24, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v24, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v0, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v24, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v24, v8, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v0, v8 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll index 2cd763afa36b7..ee8bfe8910b78 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll @@ -265,30 +265,30 @@ define @bswap_nxv1i64( %va) { ; RV32-NEXT: vsetvli a4, zero, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 24 ; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsrl.vi v10, v8, 8 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: vsrl.vx v11, v8, a2 +; RV32-NEXT: vsrl.vx v11, v8, a1 +; RV32-NEXT: vsrl.vx v12, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsll.vx v12, v8, a1 -; RV32-NEXT: 
vand.vx v11, v11, a0 -; RV32-NEXT: vlse64.v v13, (a5), zero -; RV32-NEXT: vor.vv v10, v11, v10 -; RV32-NEXT: vand.vx v11, v8, a0 -; RV32-NEXT: vsll.vx v11, v11, a2 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: vand.vx v9, v9, a4 -; RV32-NEXT: vand.vv v12, v12, v13 -; RV32-NEXT: vor.vv v9, v12, v9 -; RV32-NEXT: vand.vv v12, v8, v13 +; RV32-NEXT: vsll.vx v13, v8, a1 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vor.vv v11, v12, v11 +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsll.vx v12, v12, a2 +; RV32-NEXT: vor.vv v12, v13, v12 +; RV32-NEXT: vlse64.v v13, (a5), zero +; RV32-NEXT: vand.vv v10, v10, v13 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vand.vv v10, v8, v13 ; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v9, v9, v10 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -349,30 +349,30 @@ define @bswap_nxv2i64( %va) { ; RV32-NEXT: vsetvli a4, zero, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 24 ; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsrl.vx v12, v8, a1 -; RV32-NEXT: vsrl.vx v14, v8, a2 +; RV32-NEXT: vsrl.vx v14, v8, a1 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: vand.vx v14, v14, a0 -; RV32-NEXT: vlse64.v v18, (a5), zero -; RV32-NEXT: vor.vv v12, v14, v12 -; RV32-NEXT: vand.vx v14, v8, a0 -; RV32-NEXT: vsll.vx v14, v14, a2 -; RV32-NEXT: vor.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v16, v8, 8 ; RV32-NEXT: vand.vx v10, v10, a4 -; RV32-NEXT: vand.vv v16, v16, v18 -; RV32-NEXT: vor.vv v10, v16, v10 -; RV32-NEXT: vand.vv v16, v8, v18 +; RV32-NEXT: vsll.vx v18, v8, a1 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vor.vv v14, v16, v14 +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vx v16, v16, a2 +; RV32-NEXT: vor.vv v16, v18, v16 +; RV32-NEXT: vlse64.v v18, (a5), zero +; RV32-NEXT: vand.vv v12, v12, v18 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vand.vv v12, v8, v18 ; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v10, v10, v12 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v10, v10, v14 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -431,32 +431,32 @@ define @bswap_nxv4i64( %va) { ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: vsetvli a4, zero, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v16, v8, 24 ; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: addi a5, sp, 8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: vsrl.vx v20, v8, a2 +; RV32-NEXT: vsrl.vx v20, v8, a1 +; RV32-NEXT: vsrl.vx v24, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsll.vx v24, v8, a1 -; RV32-NEXT: vand.vx v20, v20, a0 -; RV32-NEXT: vlse64.v v28, (a5), zero -; RV32-NEXT: vor.vv v16, v20, v16 -; RV32-NEXT: vand.vx v20, v8, a0 -; RV32-NEXT: vsll.vx v20, v20, a2 +; RV32-NEXT: vand.vx v16, v16, 
a4 +; RV32-NEXT: vsll.vx v28, v8, a1 +; RV32-NEXT: vand.vx v24, v24, a0 ; RV32-NEXT: vor.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vand.vv v24, v24, v28 -; RV32-NEXT: vor.vv v12, v24, v12 -; RV32-NEXT: vand.vv v24, v8, v28 +; RV32-NEXT: vand.vx v24, v8, a0 +; RV32-NEXT: vsll.vx v24, v24, a2 +; RV32-NEXT: vor.vv v24, v28, v24 +; RV32-NEXT: vlse64.v v28, (a5), zero +; RV32-NEXT: vand.vv v12, v12, v28 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vand.vv v16, v8, v28 ; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v12, v12, v16 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v20, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v12, v12, v20 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -518,38 +518,38 @@ define @bswap_nxv8i64( %va) { ; RV32-NEXT: li a1, 56 ; RV32-NEXT: li a2, 40 ; RV32-NEXT: lui a3, 16 -; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: addi a5, sp, 8 -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsll.vx v0, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a0 +; RV32-NEXT: vand.vx v24, v24, a3 ; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a3 ; RV32-NEXT: vsll.vx v16, v16, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a5), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a1 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24 -; RV32-NEXT: vand.vv v16, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a1 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v16, v16, 8 ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll index 0c58cca0f9472..8243e103a9271 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll @@ -604,29 +604,29 @@ define @vp_bswap_nxv1i64_unmasked( %va, i32 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw 
zero, 12(sp) -; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: vsll.vx v11, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsrl.vx v11, v8, a2 -; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v13, v8, a4 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vx v13, v13, a0 +; RV32-NEXT: vor.vv v12, v13, v12 ; RV32-NEXT: vand.vx v13, v8, a0 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vor.vv v11, v12, v11 -; RV32-NEXT: vlse64.v v12, (a6), zero ; RV32-NEXT: vsll.vx v13, v13, a4 -; RV32-NEXT: vor.vv v10, v10, v13 -; RV32-NEXT: vsrl.vi v13, v8, 8 -; RV32-NEXT: vand.vx v9, v9, a5 -; RV32-NEXT: vand.vv v13, v13, v12 -; RV32-NEXT: vor.vv v9, v13, v9 -; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vor.vv v11, v11, v13 +; RV32-NEXT: vlse64.v v13, (a6), zero +; RV32-NEXT: vand.vv v10, v10, v13 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vand.vv v10, v8, v13 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vor.vv v9, v9, v12 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -772,29 +772,29 @@ define @vp_bswap_nxv2i64_unmasked( %va, i32 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v12, v8, a2 +; RV32-NEXT: vsll.vx v14, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsrl.vx v14, v8, a2 -; RV32-NEXT: vsrl.vx v16, v8, a4 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v18, v8, a4 +; RV32-NEXT: vand.vx v10, v10, a5 +; RV32-NEXT: vand.vx v18, v18, a0 +; RV32-NEXT: vor.vv v16, v18, v16 ; RV32-NEXT: vand.vx v18, v8, a0 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vor.vv v14, v16, v14 -; RV32-NEXT: vlse64.v v16, (a6), zero ; RV32-NEXT: vsll.vx v18, v18, a4 -; RV32-NEXT: vor.vv v12, v12, v18 -; RV32-NEXT: vsrl.vi v18, v8, 8 -; RV32-NEXT: vand.vx v10, v10, a5 -; RV32-NEXT: vand.vv v18, v18, v16 -; RV32-NEXT: vor.vv v10, v18, v10 -; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vor.vv v14, v14, v18 +; RV32-NEXT: vlse64.v v18, (a6), zero +; RV32-NEXT: vand.vv v12, v12, v18 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vand.vv v12, v8, v18 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v10, v10, v14 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vor.vv v10, v10, v16 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -940,29 +940,29 @@ define @vp_bswap_nxv4i64_unmasked( %va, i32 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v16, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: vsll.vx v20, v8, a2 ; RV32-NEXT: addi a0, a3, -256 -; RV32-NEXT: vsrl.vx v20, v8, a2 -; RV32-NEXT: vsrl.vx v24, v8, a4 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vsrl.vx v28, v8, a4 +; RV32-NEXT: vand.vx v12, v12, a5 +; RV32-NEXT: vand.vx v28, v28, a0 +; RV32-NEXT: vor.vv v24, v28, v24 ; RV32-NEXT: vand.vx 
v28, v8, a0 -; RV32-NEXT: vand.vx v24, v24, a0 -; RV32-NEXT: vor.vv v20, v24, v20 -; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsll.vx v28, v28, a4 -; RV32-NEXT: vor.vv v16, v16, v28 -; RV32-NEXT: vsrl.vi v28, v8, 8 -; RV32-NEXT: vand.vx v12, v12, a5 -; RV32-NEXT: vand.vv v28, v28, v24 -; RV32-NEXT: vor.vv v12, v28, v12 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vor.vv v20, v20, v28 +; RV32-NEXT: vlse64.v v28, (a6), zero +; RV32-NEXT: vand.vv v16, v16, v28 +; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vand.vv v16, v8, v28 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v12, v12, v20 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v20, v8 +; RV32-NEXT: vor.vv v12, v12, v24 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1022,59 +1022,61 @@ define @vp_bswap_nxv7i64( %va, @vp_bswap_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: addi a2, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vand.vx v0, v0, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1292,59 
+1295,61 @@ define @vp_bswap_nxv8i64( %va, @vp_bswap_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a0, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: addi a2, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vand.vx v0, v0, a2 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v0, (a6), zero -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 ; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll index 15f6ca600cb37..b95bc73936059 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -75,17 +75,17 @@ define fastcc @ret_split_nxv64i32(ptr %x) { ; CHECK-NEXT: slli a4, a2, 5 ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub a4, a4, a3 -; CHECK-NEXT: add a5, a1, a2 -; CHECK-NEXT: vl8re32.v v16, (a5) ; CHECK-NEXT: add a5, a1, a3 +; CHECK-NEXT: vl8re32.v v16, (a5) +; CHECK-NEXT: add a5, a1, a2 ; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: add a1, a1, a4 ; CHECK-NEXT: vl8re32.v v24, (a5) ; CHECK-NEXT: vl8re32.v v0, (a1) ; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: vs8r.v v16, (a2) -; CHECK-NEXT: vs8r.v v24, (a3) +; CHECK-NEXT: vs8r.v v24, (a2) +; CHECK-NEXT: vs8r.v v16, (a3) ; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: ret @@ -245,59 +245,21 @@ define 
fastcc @ret_nxv32i1_param_nxv32i1_nxv32i1( @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32( %x, %y, %z, i32 %w) { ; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vl8re32.v v8, (a2) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re32.v v0, (a0) +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vsetvli a3, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vl8re32.v v24, (a2) ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl8re32.v v8, (a0) -; CHECK-NEXT: vl8re32.v v16, (a2) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vadd.vv v0, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v24, v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v0, v8 -; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: vadd.vx v16, v8, a4 -; CHECK-NEXT: vadd.vx v8, v24, a4 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vadd.vv v16, v16, v24 +; CHECK-NEXT: vl8re32.v v24, (a2) +; CHECK-NEXT: vadd.vv v16, v16, v24 +; CHECK-NEXT: vadd.vx v16, v16, a4 +; CHECK-NEXT: vadd.vx v8, v8, a4 ; CHECK-NEXT: ret %r = add %x, %y %s = add %r, %z @@ -325,19 +287,19 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_i32( @ret_nxv32i32_call_nxv32i32_nxv32i32_i32( @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV32-NEXT: andi sp, sp, -128 ; RV32-NEXT: addi a1, sp, 128 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv8r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a2) ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 128 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a2, a2, a1 ; RV32-NEXT: add a3, a0, a1 -; RV32-NEXT: vl8re32.v v0, (a2) -; RV32-NEXT: vl8re32.v v24, (a3) -; RV32-NEXT: vl8re32.v v16, (a0) +; RV32-NEXT: vl8re32.v v24, (a2) +; RV32-NEXT: vl8re32.v v16, (a3) 
+; RV32-NEXT: vl8re32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 128 -; RV32-NEXT: vs8r.v v16, (a3) +; RV32-NEXT: vs8r.v v0, (a0) +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 128 +; RV32-NEXT: vs8r.v v8, (a2) ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: addi a2, sp, 128 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: addi a3, sp, 128 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: vs8r.v v16, (a1) ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 @@ -445,16 +411,13 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 128 -; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: li a5, 42 -; RV32-NEXT: vs8r.v v24, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 128 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv8r.v v16, v0 +; RV32-NEXT: vmv8r.v v16, v24 ; RV32-NEXT: call ext3 ; RV32-NEXT: addi sp, s0, -144 ; RV32-NEXT: .cfi_def_cfa sp, 144 @@ -483,33 +446,37 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: addi a1, sp, 128 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv8r.v v0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a2) ; RV64-NEXT: csrr a3, vlenb ; RV64-NEXT: slli a3, a3, 3 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 128 -; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a2, a2, a1 ; RV64-NEXT: add a3, a0, a1 -; RV64-NEXT: vl8re32.v v0, (a2) -; RV64-NEXT: vl8re32.v v24, (a3) -; RV64-NEXT: vl8re32.v v16, (a0) +; RV64-NEXT: vl8re32.v v24, (a2) +; RV64-NEXT: vl8re32.v v16, (a3) +; RV64-NEXT: vl8re32.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 5 -; RV64-NEXT: add a3, sp, a3 -; RV64-NEXT: addi a3, a3, 128 -; RV64-NEXT: vs8r.v v16, (a3) +; RV64-NEXT: vs8r.v v0, (a0) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 5 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 128 +; RV64-NEXT: vs8r.v v8, (a2) ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: addi a2, sp, 128 -; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: addi a3, sp, 128 +; RV64-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: vs8r.v v16, (a1) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 5 ; RV64-NEXT: add a0, sp, a0 @@ -518,16 +485,13 @@ define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_ ; RV64-NEXT: slli a2, a2, 4 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 128 -; RV64-NEXT: add a1, a3, a1 ; RV64-NEXT: li a5, 42 -; RV64-NEXT: vs8r.v v24, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 
; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 128 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv8r.v v16, v0 +; RV64-NEXT: vmv8r.v v16, v24 ; RV64-NEXT: call ext3 ; RV64-NEXT: addi sp, s0, -144 ; RV64-NEXT: .cfi_def_cfa sp, 144 @@ -551,11 +515,11 @@ define fastcc @vector_arg_indirect_stack(i32 %0, i32 %1, i32 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, t5, a0 -; CHECK-NEXT: vl8re32.v v24, (t5) -; CHECK-NEXT: vl8re32.v v0, (a0) +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vl8re32.v v0, (t5) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v24 -; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: ret %s = add %x, %z ret %s @@ -608,8 +572,8 @@ define fastcc @pass_vector_arg_indirect_stack( @pass_vector_arg_indirect_stack( @pass_vector_arg_indirect_stack_no_gpr( @pass_vector_arg_indirect_stack_no_gpr( @callee_scalable_vector_split_indirect( %x, %y ret %a @@ -41,9 +41,9 @@ define @caller_scalable_vector_split_indirect( @caller_scalable_vector_split_indirect( @vp_ceil_vv_nxv1bf16( %va, @vp_ceil_vv_nxv1bf16_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -76,12 +76,12 @@ define @vp_ceil_vv_nxv2bf16( %va, @vp_ceil_vv_nxv2bf16_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -130,12 +130,12 @@ define @vp_ceil_vv_nxv4bf16( %va, @vp_ceil_vv_nxv4bf16_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -184,12 +184,12 @@ define @vp_ceil_vv_nxv8bf16( %va, @vp_ceil_vv_nxv8bf16_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -238,12 +238,12 @@ define @vp_ceil_vv_nxv16bf16( %va, ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, 
m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t @@ -265,11 +265,11 @@ define @vp_ceil_vv_nxv16bf16_unmasked( @vp_ceil_vv_nxv32bf16( %va, ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: lui a3, 307200 +; CHECK-NEXT: fsrmi a4, 3 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 @@ -315,11 +316,10 @@ define @vp_ceil_vv_nxv32bf16( %va, ; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 3 ; CHECK-NEXT: vmv1r.v v0, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: fsrm a4 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -336,11 +336,11 @@ define @vp_ceil_vv_nxv32bf16( %va, ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t @@ -375,11 +375,12 @@ define @vp_ceil_vv_nxv32bf16_unmasked( @vp_ceil_vv_nxv32bf16_unmasked( @vp_ceil_vv_nxv32bf16_unmasked( @llvm.vp.ceil.nxv1f16(, @vp_ceil_vv_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a0) +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -461,12 +461,12 @@ define @vp_ceil_vv_nxv1f16( %va, @vp_ceil_vv_nxv1f16( %va, @vp_ceil_vv_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -503,11 +503,11 @@ define @vp_ceil_vv_nxv1f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ 
-525,13 +525,13 @@ declare @llvm.vp.ceil.nxv2f16(, @vp_ceil_vv_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a0) +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -547,12 +547,12 @@ define @vp_ceil_vv_nxv2f16( %va, @vp_ceil_vv_nxv2f16( %va, @vp_ceil_vv_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -589,11 +589,11 @@ define @vp_ceil_vv_nxv2f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -611,13 +611,13 @@ declare @llvm.vp.ceil.nxv4f16(, @vp_ceil_vv_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a0) +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -633,12 +633,12 @@ define @vp_ceil_vv_nxv4f16( %va, @vp_ceil_vv_nxv4f16( %va, @vp_ceil_vv_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -675,11 +675,11 @@ define @vp_ceil_vv_nxv4f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 
; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -699,12 +699,12 @@ define @vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -763,11 +763,11 @@ define @vp_ceil_vv_nxv8f16_unmasked( %va, ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -787,12 +787,12 @@ define @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -851,11 +851,11 @@ define @vp_ceil_vv_nxv16f16_unmasked( % ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -875,12 +875,12 @@ define @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -995,11 +995,12 @@ define @vp_ceil_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 +; ZVFHMIN-NEXT: fsrmi a4, 3 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, 
ma ; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: vmv1r.v v17, v16 @@ -1014,11 +1015,10 @@ define @vp_ceil_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a2, 3 ; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: fsrm a2 +; ZVFHMIN-NEXT: fsrm a4 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -1033,10 +1033,10 @@ define @vp_ceil_vv_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -1064,9 +1064,9 @@ define @vp_ceil_vv_nxv1f32( %va, @vp_ceil_vv_nxv1f32_unmasked( %v ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1106,9 +1106,9 @@ define @vp_ceil_vv_nxv2f32( %va, @vp_ceil_vv_nxv2f32_unmasked( %v ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1149,9 +1149,9 @@ define @vp_ceil_vv_nxv4f32( %va, @vp_ceil_vv_nxv4f32_unmasked( %v ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1193,9 +1193,9 @@ define @vp_ceil_vv_nxv8f32( %va, @vp_ceil_vv_nxv8f32_unmasked( %v ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1237,9 +1237,9 @@ define @vp_ceil_vv_nxv16f32( %va, @vp_ceil_vv_nxv16f32_unmasked( ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1276,13 +1276,13 @@ declare @llvm.vp.ceil.nxv1f64(, @vp_ceil_vv_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, 
v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -1297,12 +1297,12 @@ define @vp_ceil_vv_nxv1f64( %va, @vp_ceil_vv_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1320,12 +1320,12 @@ define @vp_ceil_vv_nxv2f64( %va, @vp_ceil_vv_nxv2f64( %va, @vp_ceil_vv_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1364,12 +1364,12 @@ define @vp_ceil_vv_nxv4f64( %va, @vp_ceil_vv_nxv4f64( %va, @vp_ceil_vv_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1408,12 +1408,12 @@ define @vp_ceil_vv_nxv7f64( %va, @vp_ceil_vv_nxv7f64( %va, @vp_ceil_vv_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1452,12 +1452,12 @@ define @vp_ceil_vv_nxv8f64( %va, @vp_ceil_vv_nxv8f64( %va, @vp_ceil_vv_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_vv_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1498,59 +1498,66 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; 
CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI44_0) ; CHECK-NEXT: srli a3, a1, 3 ; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 +; CHECK-NEXT: vslidedown.vx v25, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: fsrmi a3, 3 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 3 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a3 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -1570,12 +1577,12 @@ define @vp_ceil_vv_nxv16f64_unmasked( @vp_ceil_vv_nxv16f64_unmasked(, ptr %a %v2 = load <2 x i8>, ptr %b @@ -68,13 +68,13 @@ define void @v4xi8_concat_vector_insert_idx3(ptr %a, ptr %b, i8 %x) { ; 
CHECK-LABEL: v4xi8_concat_vector_insert_idx3: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) ; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v9, v10, 1 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 2 -; CHECK-NEXT: vse8.v v9, (a0) +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %v1 = load <2 x i8>, ptr %a %v2 = load <2 x i8>, ptr %b @@ -156,26 +156,26 @@ define void @v4xi64_concat_vector_insert_idx2(ptr %a, ptr %b, i64 %x) { ; RV32-LABEL: v4xi64_concat_vector_insert_idx2: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a1) -; RV32-NEXT: vle64.v v10, (a0) +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vle64.v v10, (a1) ; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vslideup.vi v10, v8, 2 -; RV32-NEXT: vse64.v v10, (a0) +; RV32-NEXT: vslideup.vi v8, v10, 2 +; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: v4xi64_concat_vector_insert_idx2: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a1) -; RV64-NEXT: vle64.v v10, (a0) +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vle64.v v10, (a1) ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v8, a2 +; RV64-NEXT: vmv.s.x v10, a2 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vslideup.vi v10, v8, 2 -; RV64-NEXT: vse64.v v10, (a0) +; RV64-NEXT: vslideup.vi v8, v10, 2 +; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret %v1 = load <2 x i64>, ptr %a %v2 = load <2 x i64>, ptr %b @@ -189,28 +189,28 @@ define void @v4xi64_concat_vector_insert_idx3(ptr %a, ptr %b, i64 %x) { ; RV32-LABEL: v4xi64_concat_vector_insert_idx3: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a1) -; RV32-NEXT: vle64.v v10, (a0) +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vle64.v v10, (a1) ; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV32-NEXT: vslide1down.vx v9, v8, a2 ; RV32-NEXT: vslide1down.vx v9, v9, a3 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 +; RV32-NEXT: vslideup.vi v10, v9, 1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vslideup.vi v10, v8, 2 -; RV32-NEXT: vse64.v v10, (a0) +; RV32-NEXT: vslideup.vi v8, v10, 2 +; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: v4xi64_concat_vector_insert_idx3: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a1) -; RV64-NEXT: vle64.v v10, (a0) +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vle64.v v10, (a1) ; RV64-NEXT: vmv.s.x v9, a2 -; RV64-NEXT: vslideup.vi v8, v9, 1 +; RV64-NEXT: vslideup.vi v10, v9, 1 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vslideup.vi v10, v8, 2 -; RV64-NEXT: vse64.v v10, (a0) +; RV64-NEXT: vslideup.vi v8, v10, 2 +; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret %v1 = load <2 x i64>, ptr %a %v2 = load <2 x i64>, ptr %b diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll index f6c26bbba89fe..d470b8b9bff18 100644 --- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll 
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll @@ -31,13 +31,12 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan ; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v11, 10 +; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV32-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32-NEXT: vrgather.vi v10, v9, 0 -; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmerge.vim v8, v9, 1, v0 +; RV32-NEXT: vrgather.vi v9, v8, 0 +; RV32-NEXT: vmsne.vi v0, v9, 0 ; RV32-NEXT: vse32.v v11, (a0), v0.t ; RV32-NEXT: ret ; @@ -56,13 +55,13 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan ; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64-NEXT: vmv.v.i v10, 10 -; RV64-NEXT: vmv1r.v v0, v12 -; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: vrgather.vi v11, v9, 0 -; RV64-NEXT: vmsne.vi v0, v11, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v8, v9, 1, v0 +; RV64-NEXT: vrgather.vi v9, v8, 0 +; RV64-NEXT: vmsne.vi v0, v9, 0 ; RV64-NEXT: vse32.v v10, (a0), v0.t ; RV64-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir index a9da6c305aac3..ce78f5a367d01 100644 --- a/llvm/test/CodeGen/RISCV/rvv/copyprop.mir +++ b/llvm/test/CodeGen/RISCV/rvv/copyprop.mir @@ -6,9 +6,9 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma + ; CHECK-NEXT: vsll.vi v9, v8, 5 ; CHECK-NEXT: vmsne.vi v0, v8, 0 - ; CHECK-NEXT: vsll.vi v8, v8, 5 - ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 + ; CHECK-NEXT: vmerge.vim v8, v9, -1, v0 ; CHECK-NEXT: sf.vc.v.x 3, 31, v9, a1 ; CHECK-NEXT: bgeu a0, zero, .LBB0_3 ; CHECK-NEXT: # %bb.1: # %entry @@ -45,8 +45,8 @@ body: | %3:vr = COPY $v8 %17:vr = PseudoVSLL_VI_M1 undef $noreg, %3, 5, 1, 6 /* e64 */, 0 %22:vr = PseudoVMSNE_VI_M1 %3, 0, 1, 6 /* e64 */ - $v0 = COPY %22 - %25:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, %17, -1, $v0, 1, 6 /* e64 */ + %23:vmv0 = COPY %22 + %25:vrnov0 = PseudoVMERGE_VIM_M1 undef $noreg, %17, -1, %23, 1, 6 /* e64 */ %29:vr = PseudoVC_V_X_SE_M1 3, 31, %2, 1, 6 /* e64 */, implicit-def dead $sf_vcix_state, implicit $sf_vcix_state %30:vr = PseudoVMV_V_I_M1 undef $noreg, 0, 1, 6 /* e64 */, 0 BGEU %1, $x0, %bb.2 diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll index 208735b18cbab..024e976d8880c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -809,12 +809,12 @@ define @ctlz_nxv1i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vminu.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv1i32: @@ -881,12 +881,12 @@ define @ctlz_nxv2i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, 
zero, e32, m1, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vminu.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv2i32: @@ -953,12 +953,12 @@ define @ctlz_nxv4i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vminu.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv4i32: @@ -1025,12 +1025,12 @@ define @ctlz_nxv8i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vminu.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv8i32: @@ -1097,12 +1097,12 @@ define @ctlz_nxv16i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vminu.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vminu.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv16i32: @@ -1110,12 +1110,12 @@ define @ctlz_nxv16i32( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 158 -; CHECK-D-NEXT: vsrl.vi v8, v8, 23 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 32 -; CHECK-D-NEXT: vminu.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 158 +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 32 +; CHECK-D-NEXT: vminu.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_nxv16i32: @@ -1232,16 +1232,16 @@ define @ctlz_nxv1i64( %va) { ; CHECK-F-LABEL: ctlz_nxv1i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: fsrmi a1, 1 -; CHECK-F-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 -; CHECK-F-NEXT: vmv.v.x v8, a0 -; CHECK-F-NEXT: vsrl.vi v9, v9, 23 -; CHECK-F-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vmv.v.x v9, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vwsubu.vv v10, v9, v8 ; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-F-NEXT: vminu.vx v8, v10, a0 -; CHECK-F-NEXT: fsrm a1 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: 
ctlz_nxv1i64: @@ -1249,13 +1249,13 @@ define @ctlz_nxv1i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vminu.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vminu.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_nxv1i64: @@ -1372,16 +1372,16 @@ define @ctlz_nxv2i64( %va) { ; CHECK-F-LABEL: ctlz_nxv2i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: fsrmi a1, 1 -; CHECK-F-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vmv.v.x v8, a0 -; CHECK-F-NEXT: vsrl.vi v9, v10, 23 -; CHECK-F-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vmv.v.x v10, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v11, v8 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v11, 23 +; CHECK-F-NEXT: vwsubu.vv v12, v10, v8 ; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v10, a0 -; CHECK-F-NEXT: fsrm a1 +; CHECK-F-NEXT: vminu.vx v8, v12, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv2i64: @@ -1389,13 +1389,13 @@ define @ctlz_nxv2i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vminu.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vminu.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_nxv2i64: @@ -1512,16 +1512,16 @@ define @ctlz_nxv4i64( %va) { ; CHECK-F-LABEL: ctlz_nxv4i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: fsrmi a1, 1 -; CHECK-F-NEXT: vsetvli a2, zero, e32, m2, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 -; CHECK-F-NEXT: vmv.v.x v8, a0 -; CHECK-F-NEXT: vsrl.vi v10, v12, 23 -; CHECK-F-NEXT: vwsubu.vv v12, v8, v10 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vmv.v.x v12, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v14, v8 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v14, 23 +; CHECK-F-NEXT: vwsubu.vv v16, v12, v8 ; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v12, a0 -; CHECK-F-NEXT: fsrm a1 +; CHECK-F-NEXT: vminu.vx v8, v16, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv4i64: @@ -1529,13 +1529,13 @@ define @ctlz_nxv4i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vminu.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vminu.vx v8, v8, a0 ; 
CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_nxv4i64: @@ -1652,16 +1652,16 @@ define @ctlz_nxv8i64( %va) { ; CHECK-F-LABEL: ctlz_nxv8i64: ; CHECK-F: # %bb.0: ; CHECK-F-NEXT: li a0, 190 -; CHECK-F-NEXT: fsrmi a1, 1 -; CHECK-F-NEXT: vsetvli a2, zero, e32, m4, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 -; CHECK-F-NEXT: vmv.v.x v8, a0 -; CHECK-F-NEXT: vsrl.vi v12, v16, 23 -; CHECK-F-NEXT: vwsubu.vv v16, v8, v12 +; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vmv.v.x v16, a0 +; CHECK-F-NEXT: fsrmi a0, 1 +; CHECK-F-NEXT: vfncvt.f.xu.w v20, v8 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v20, 23 +; CHECK-F-NEXT: vwsubu.vv v24, v16, v8 ; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-F-NEXT: vminu.vx v8, v16, a0 -; CHECK-F-NEXT: fsrm a1 +; CHECK-F-NEXT: vminu.vx v8, v24, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_nxv8i64: @@ -1669,13 +1669,13 @@ define @ctlz_nxv8i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vminu.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vminu.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_nxv8i64: @@ -2436,10 +2436,10 @@ define @ctlz_zero_undef_nxv1i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i32: @@ -2503,10 +2503,10 @@ define @ctlz_zero_undef_nxv2i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i32: @@ -2570,10 +2570,10 @@ define @ctlz_zero_undef_nxv4i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i32: @@ -2637,10 +2637,10 @@ define @ctlz_zero_undef_nxv8i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i32: @@ -2704,10 +2704,10 @@ define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; 
CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 158 -; CHECK-F-NEXT: vrsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 158 +; CHECK-F-NEXT: vrsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32: @@ -2715,10 +2715,10 @@ define @ctlz_zero_undef_nxv16i32( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vi v8, v8, 23 -; CHECK-D-NEXT: li a1, 158 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 +; CHECK-D-NEXT: li a0, 158 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv16i32: @@ -2838,9 +2838,9 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; CHECK-F-NEXT: vmv.v.x v9, a0 ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 +; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: vsrl.vi v10, v10, 23 ; CHECK-F-NEXT: vwsubu.vv v8, v9, v10 -; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64: @@ -2848,11 +2848,11 @@ define @ctlz_zero_undef_nxv1i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv1i64: @@ -2972,9 +2972,9 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; CHECK-F-NEXT: vmv.v.x v10, a0 ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vfncvt.f.xu.w v11, v8 +; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: vsrl.vi v11, v11, 23 ; CHECK-F-NEXT: vwsubu.vv v8, v10, v11 -; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64: @@ -2982,11 +2982,11 @@ define @ctlz_zero_undef_nxv2i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv2i64: @@ -3106,9 +3106,9 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; CHECK-F-NEXT: vmv.v.x v12, a0 ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vfncvt.f.xu.w v14, v8 +; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: vsrl.vi v14, v14, 23 ; CHECK-F-NEXT: vwsubu.vv v8, v12, v14 -; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64: @@ -3116,11 +3116,11 @@ define @ctlz_zero_undef_nxv4i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv4i64: @@ -3240,9 +3240,9 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; CHECK-F-NEXT: vmv.v.x v16, a0 ; CHECK-F-NEXT: fsrmi a0, 1 ; 
CHECK-F-NEXT: vfncvt.f.xu.w v20, v8 +; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: vsrl.vi v20, v20, 23 ; CHECK-F-NEXT: vwsubu.vv v8, v16, v20 -; CHECK-F-NEXT: fsrm a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64: @@ -3250,11 +3250,11 @@ define @ctlz_zero_undef_nxv8i64( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: li a1, 52 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1086 -; CHECK-D-NEXT: vrsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1086 +; CHECK-D-NEXT: vrsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: ctlz_zero_undef_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index 6f515996677ee..39582ee3dacae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -940,12 +940,12 @@ define @vp_ctlz_nxv16i32( %va, @vp_ctlz_nxv16i32_unmasked( %va, i ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 158 ; CHECK-NEXT: vsrl.vi v8, v8, 23 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i32_unmasked: @@ -988,13 +988,13 @@ define @vp_ctlz_nxv1i64( %va, @vp_ctlz_nxv1i64_unmasked( %va, i32 ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 52 ; CHECK-NEXT: vsrl.vx v8, v8, a0 ; CHECK-NEXT: li a0, 1086 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i64_unmasked: @@ -1038,13 +1038,13 @@ define @vp_ctlz_nxv2i64( %va, @vp_ctlz_nxv2i64_unmasked( %va, i32 ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 52 ; CHECK-NEXT: vsrl.vx v8, v8, a0 ; CHECK-NEXT: li a0, 1086 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv2i64_unmasked: @@ -1088,13 +1088,13 @@ define @vp_ctlz_nxv4i64( %va, @vp_ctlz_nxv4i64_unmasked( %va, i32 ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 52 ; CHECK-NEXT: vsrl.vx v8, v8, a0 ; CHECK-NEXT: li a0, 1086 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv4i64_unmasked: @@ -1138,13 +1138,13 @@ define @vp_ctlz_nxv7i64( %va, @vp_ctlz_nxv7i64_unmasked( %va, i32 ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 52 ; CHECK-NEXT: vsrl.vx v8, v8, a0 ; CHECK-NEXT: li a0, 1086 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv7i64_unmasked: @@ -1188,13 +1188,13 @@ define @vp_ctlz_nxv8i64( %va, @vp_ctlz_nxv8i64_unmasked( %va, i32 ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: 
vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 52 ; CHECK-NEXT: vsrl.vx v8, v8, a0 ; CHECK-NEXT: li a0, 1086 ; CHECK-NEXT: vrsub.vx v8, v8, a0 ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv8i64_unmasked: @@ -1258,14 +1258,14 @@ define @vp_ctlz_nxv16i64( %va, @vp_ctlz_nxv16i64_unmasked( %va, i ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vsrl.vx v8, v8, a2 ; CHECK-NEXT: vrsub.vx v8, v8, a3 ; CHECK-NEXT: vminu.vx v8, v8, a4 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_nxv16i64_unmasked: @@ -2201,10 +2201,10 @@ define @vp_ctlz_zero_undef_nxv16i32( %va, ; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: li a0, 158 ; CHECK-NEXT: vsrl.vi v8, v8, 23, v0.t ; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i32: @@ -2222,10 +2222,10 @@ define @vp_ctlz_zero_undef_nxv16i32_unmasked( @vp_ctlz_zero_undef_nxv1i64( %va, @vp_ctlz_zero_undef_nxv1i64_unmasked( @vp_ctlz_zero_undef_nxv2i64( %va, @vp_ctlz_zero_undef_nxv2i64_unmasked( @vp_ctlz_zero_undef_nxv4i64( %va, @vp_ctlz_zero_undef_nxv4i64_unmasked( @vp_ctlz_zero_undef_nxv7i64( %va, @vp_ctlz_zero_undef_nxv7i64_unmasked( @vp_ctlz_zero_undef_nxv8i64( %va, @vp_ctlz_zero_undef_nxv8i64_unmasked( @vp_ctlz_zero_undef_nxv16i64( %va, ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB94_2: -; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vsrl.vx v8, v8, a2, v0.t ; CHECK-NEXT: vrsub.vx v8, v8, a3, v0.t -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv16i64: @@ -2543,9 +2543,9 @@ define @vp_ctlz_zero_undef_nxv16i64_unmasked( @cttz_nxv1i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vsub.vx v8, v9, a0 -; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv1i8: @@ -59,9 +59,9 @@ define @cttz_nxv1i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v9, a0 -; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-D-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv1i8: @@ -108,9 +108,9 @@ define @cttz_nxv2i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vsub.vx v8, v9, a0 -; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv2i8: @@ -125,9 +125,9 @@ define @cttz_nxv2i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v9, v9, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; 
CHECK-D-NEXT: vsub.vx v8, v9, a0 -; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-D-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv2i8: @@ -174,9 +174,9 @@ define @cttz_nxv4i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v9, v12, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vsub.vx v8, v9, a0 -; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv4i8: @@ -191,9 +191,9 @@ define @cttz_nxv4i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v9, v12, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v9, 0 +; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v9, a0 -; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-D-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv4i8: @@ -240,9 +240,9 @@ define @cttz_nxv8i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v10, v12, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-F-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-F-NEXT: vsub.vx v9, v9, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vsub.vx v8, v9, a0 -; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv8i8: @@ -257,9 +257,9 @@ define @cttz_nxv8i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v10, v12, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-D-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-D-NEXT: vsub.vx v9, v9, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v9, a0 -; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-D-NEXT: vmerge.vim v8, v9, 8, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv8i8: @@ -306,9 +306,9 @@ define @cttz_nxv16i8( %va) { ; CHECK-F-NEXT: vnsrl.wi v12, v16, 23 ; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-F-NEXT: vnsrl.wi v10, v12, 0 +; CHECK-F-NEXT: vsub.vx v10, v10, a0 ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vsub.vx v8, v10, a0 -; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-F-NEXT: vmerge.vim v8, v10, 8, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv16i8: @@ -323,9 +323,9 @@ define @cttz_nxv16i8( %va) { ; CHECK-D-NEXT: vnsrl.wi v12, v16, 23 ; CHECK-D-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-D-NEXT: vnsrl.wi v10, v12, 0 +; CHECK-D-NEXT: vsub.vx v10, v10, a0 ; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v10, a0 -; CHECK-D-NEXT: vmerge.vim v8, v8, 8, v0 +; CHECK-D-NEXT: vmerge.vim v8, v10, 8, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv16i8: @@ -811,15 +811,15 @@ define @cttz_nxv1i32( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 -; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vfcvt.f.xu.v v8, v9 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 +; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsrl.vi v8, v9, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv1i32: @@ -882,15 +882,15 @@ define @cttz_nxv2i32( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; 
CHECK-F-NEXT: vrsub.vi v9, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 -; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vfcvt.f.xu.v v8, v9 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 +; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsrl.vi v8, v9, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv2i32: @@ -953,15 +953,15 @@ define @cttz_nxv4i32( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v10, v8, v10 -; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vfcvt.f.xu.v v8, v10 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 +; CHECK-F-NEXT: vfcvt.f.xu.v v10, v10 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv4i32: @@ -1024,15 +1024,15 @@ define @cttz_nxv8i32( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v12, v8, v12 -; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vfcvt.f.xu.v v8, v12 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 +; CHECK-F-NEXT: vfcvt.f.xu.v v12, v12 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv8i32: @@ -1095,15 +1095,15 @@ define @cttz_nxv16i32( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v16, v8, v16 -; CHECK-F-NEXT: vmseq.vi v0, v8, 0 -; CHECK-F-NEXT: vfcvt.f.xu.v v8, v16 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 -; CHECK-F-NEXT: li a1, 32 -; CHECK-F-NEXT: vmerge.vxm v8, v8, a1, v0 +; CHECK-F-NEXT: vfcvt.f.xu.v v16, v16 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vmseq.vi v0, v8, 0 +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 +; CHECK-F-NEXT: li a0, 32 +; CHECK-F-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv16i32: @@ -1111,15 +1111,15 @@ define @cttz_nxv16i32( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 127 ; CHECK-D-NEXT: vand.vv v16, v8, v16 -; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vfcvt.f.xu.v v8, v16 -; CHECK-D-NEXT: vsrl.vi v8, v8, 23 -; CHECK-D-NEXT: vsub.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 32 -; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 +; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 127 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; 
CHECK-D-NEXT: vsrl.vi v8, v16, 23 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 32 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv16i32: @@ -1218,17 +1218,19 @@ define @cttz_nxv1i64( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-F-NEXT: vrsub.vi v9, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v9, v8, v9 +; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v10, v9 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v8, v9 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vwsubu.vx v9, v8, a1 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsrl.vi v8, v10, 23 +; CHECK-F-NEXT: vwsubu.vx v9, v8, a0 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv1i64: @@ -1236,16 +1238,16 @@ define @cttz_nxv1i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v9, v8, v9 ; CHECK-D-NEXT: vfcvt.f.xu.v v9, v9 -; CHECK-D-NEXT: vsrl.vx v9, v9, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v9, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v9, v9, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v9, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv1i64: @@ -1344,17 +1346,19 @@ define @cttz_nxv2i64( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-F-NEXT: vrsub.vi v10, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v10, v8, v10 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v12, v10 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v8, v10 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vwsubu.vx v10, v8, a1 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsrl.vi v8, v12, 23 +; CHECK-F-NEXT: vwsubu.vx v10, v8, a0 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmerge.vxm v8, v10, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv2i64: @@ -1362,16 +1366,16 @@ define @cttz_nxv2i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v10, v8, v10 ; CHECK-D-NEXT: vfcvt.f.xu.v v10, v10 -; CHECK-D-NEXT: vsrl.vx v10, v10, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v10, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v10, v10, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: 
vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v10, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv2i64: @@ -1470,17 +1474,19 @@ define @cttz_nxv4i64( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-F-NEXT: vrsub.vi v12, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v12, v8, v12 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v16, v12 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v8, v12 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vwsubu.vx v12, v8, a1 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsrl.vi v8, v16, 23 +; CHECK-F-NEXT: vwsubu.vx v12, v8, a0 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmerge.vxm v8, v12, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv4i64: @@ -1488,16 +1494,16 @@ define @cttz_nxv4i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v12, v8, v12 ; CHECK-D-NEXT: vfcvt.f.xu.v v12, v12 -; CHECK-D-NEXT: vsrl.vx v12, v12, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v12, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v12, v12, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v12, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv4i64: @@ -1596,17 +1602,19 @@ define @cttz_nxv8i64( %va) { ; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-F-NEXT: vrsub.vi v16, v8, 0 ; CHECK-F-NEXT: fsrmi a0, 1 -; CHECK-F-NEXT: li a1, 127 ; CHECK-F-NEXT: vand.vv v16, v8, v16 +; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-F-NEXT: vfncvt.f.xu.w v24, v16 +; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-F-NEXT: vmseq.vi v0, v8, 0 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-F-NEXT: vfncvt.f.xu.w v8, v16 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: vwsubu.vx v16, v8, a1 -; CHECK-F-NEXT: li a1, 64 +; CHECK-F-NEXT: vsrl.vi v8, v24, 23 +; CHECK-F-NEXT: vwsubu.vx v16, v8, a0 +; CHECK-F-NEXT: li a0, 64 ; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0 -; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vmerge.vxm v8, v16, a0, v0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_nxv8i64: @@ -1614,16 +1622,16 @@ define @cttz_nxv8i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v16, v8, v16 ; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16 -; CHECK-D-NEXT: vsrl.vx v16, v16, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vmseq.vi v0, v8, 0 -; CHECK-D-NEXT: vsub.vx v8, v16, a1 -; CHECK-D-NEXT: li a1, 64 -; CHECK-D-NEXT: vmerge.vxm v8, v8, a1, v0 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v16, v16, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: 
vmseq.vi v0, v8, 0 +; CHECK-D-NEXT: vsub.vx v8, v16, a0 +; CHECK-D-NEXT: li a0, 64 +; CHECK-D-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_nxv8i64: @@ -2378,10 +2386,10 @@ define @cttz_zero_undef_nxv1i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv1i32: @@ -2442,10 +2450,10 @@ define @cttz_zero_undef_nxv2i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv2i32: @@ -2506,10 +2514,10 @@ define @cttz_zero_undef_nxv4i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vand.vv v8, v8, v10 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv4i32: @@ -2570,10 +2578,10 @@ define @cttz_zero_undef_nxv8i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vand.vv v8, v8, v12 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv8i32: @@ -2634,10 +2642,10 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-F-NEXT: fsrmi a0, 1 ; CHECK-F-NEXT: vand.vv v8, v8, v16 ; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-F-NEXT: vsrl.vi v8, v8, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vsub.vx v8, v8, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v8, v8, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vsub.vx v8, v8, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv16i32: @@ -2647,10 +2655,10 @@ define @cttz_zero_undef_nxv16i32( %va) { ; CHECK-D-NEXT: fsrmi a0, 1 ; CHECK-D-NEXT: vand.vv v8, v8, v16 ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vi v8, v8, 23 -; CHECK-D-NEXT: li a1, 127 -; CHECK-D-NEXT: vsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: vsrl.vi v8, v8, 23 +; CHECK-D-NEXT: li a0, 127 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv16i32: @@ -2751,10 +2759,10 @@ define @cttz_zero_undef_nxv1i64( %va) { ; CHECK-F-NEXT: vand.vv v8, v8, v9 ; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8 -; CHECK-F-NEXT: vsrl.vi v9, v9, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v8, v9, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v9, v9, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v9, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv1i64: @@ -2762,13 +2770,13 @@ define @cttz_zero_undef_nxv1i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-D-NEXT: vrsub.vi v9, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 
1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v9 ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv1i64: @@ -2869,10 +2877,10 @@ define @cttz_zero_undef_nxv2i64( %va) { ; CHECK-F-NEXT: vand.vv v8, v8, v10 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8 -; CHECK-F-NEXT: vsrl.vi v10, v10, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v8, v10, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v10, v10, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v10, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv2i64: @@ -2880,13 +2888,13 @@ define @cttz_zero_undef_nxv2i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-D-NEXT: vrsub.vi v10, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v10 ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv2i64: @@ -2987,10 +2995,10 @@ define @cttz_zero_undef_nxv4i64( %va) { ; CHECK-F-NEXT: vand.vv v8, v8, v12 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8 -; CHECK-F-NEXT: vsrl.vi v12, v12, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v8, v12, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v12, v12, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v12, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv4i64: @@ -2998,13 +3006,13 @@ define @cttz_zero_undef_nxv4i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-D-NEXT: vrsub.vi v12, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v12 ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv4i64: @@ -3105,10 +3113,10 @@ define @cttz_zero_undef_nxv8i64( %va) { ; CHECK-F-NEXT: vand.vv v8, v8, v16 ; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8 -; CHECK-F-NEXT: vsrl.vi v16, v16, 23 -; CHECK-F-NEXT: li a1, 127 -; CHECK-F-NEXT: vwsubu.vx v8, v16, a1 ; CHECK-F-NEXT: fsrm a0 +; CHECK-F-NEXT: vsrl.vi v16, v16, 23 +; CHECK-F-NEXT: li a0, 127 +; CHECK-F-NEXT: vwsubu.vx v8, v16, a0 ; CHECK-F-NEXT: ret ; ; CHECK-D-LABEL: cttz_zero_undef_nxv8i64: @@ -3116,13 +3124,13 @@ define @cttz_zero_undef_nxv8i64( %va) { ; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-D-NEXT: vrsub.vi v16, v8, 0 ; CHECK-D-NEXT: fsrmi a0, 1 -; CHECK-D-NEXT: li a1, 52 ; CHECK-D-NEXT: vand.vv v8, v8, v16 ; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8 -; CHECK-D-NEXT: vsrl.vx v8, v8, a1 -; CHECK-D-NEXT: li a1, 1023 -; CHECK-D-NEXT: vsub.vx v8, v8, a1 ; CHECK-D-NEXT: fsrm a0 +; CHECK-D-NEXT: li a0, 52 +; CHECK-D-NEXT: vsrl.vx v8, v8, a0 +; 
CHECK-D-NEXT: li a0, 1023 +; CHECK-D-NEXT: vsub.vx v8, v8, a0 ; CHECK-D-NEXT: ret ; ; CHECK-ZVBB-LABEL: cttz_zero_undef_nxv8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 766717d92a749..60ea1881ed213 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -3708,12 +3708,12 @@ define @vp_cttz_zero_undef_nxv16i32( %va, ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vrsub.vi v16, v8, 0, v0.t ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: li a1, 127 ; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 23, v0.t -; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t ; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsrl.vi v8, v8, 23, v0.t +; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i32: @@ -3733,10 +3733,10 @@ define @vp_cttz_zero_undef_nxv16i32_unmasked( @vp_cttz_zero_undef_nxv1i64( %va, @vp_cttz_zero_undef_nxv1i64_unmasked( @vp_cttz_zero_undef_nxv2i64( %va, @vp_cttz_zero_undef_nxv2i64_unmasked( @vp_cttz_zero_undef_nxv4i64( %va, @vp_cttz_zero_undef_nxv4i64_unmasked( @vp_cttz_zero_undef_nxv7i64( %va, @vp_cttz_zero_undef_nxv7i64_unmasked( @vp_cttz_zero_undef_nxv8i64( %va, @vp_cttz_zero_undef_nxv8i64_unmasked( @vp_cttz_zero_undef_nxv16i64( %va, ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vand.vv v8, v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.xu.v v8, v8, v0.t +; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vsrl.vx v8, v8, a2, v0.t ; CHECK-NEXT: vsub.vx v8, v8, a3, v0.t -; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb @@ -4104,12 +4104,12 @@ define @vp_cttz_zero_undef_nxv16i64_unmasked(This Inner Loop Header: Depth=1 ; NO-SINK-NEXT: vl1re32.v v9, (a5) ; NO-SINK-NEXT: sub a6, a6, a3 ; NO-SINK-NEXT: vfadd.vv v9, v9, v8 ; NO-SINK-NEXT: vs1r.v v9, (a5) -; NO-SINK-NEXT: add a5, a5, a1 +; NO-SINK-NEXT: add a5, a5, a2 ; NO-SINK-NEXT: bnez a6, .LBB4_3 ; NO-SINK-NEXT: # %bb.4: # %middle.block ; NO-SINK-NEXT: beqz a4, .LBB4_7 ; NO-SINK-NEXT: .LBB4_5: # %for.body.preheader -; NO-SINK-NEXT: slli a1, a2, 2 +; NO-SINK-NEXT: slli a1, a1, 2 ; NO-SINK-NEXT: lui a2, 1 ; NO-SINK-NEXT: add a1, a0, a1 ; NO-SINK-NEXT: add a0, a0, a2 @@ -448,19 +448,19 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; ; SINK-LABEL: sink_splat_fadd_scalable: ; SINK: # %bb.0: # %entry -; SINK-NEXT: csrr a1, vlenb -; SINK-NEXT: srli a3, a1, 2 -; SINK-NEXT: li a2, 1024 -; SINK-NEXT: bgeu a2, a3, .LBB4_2 +; SINK-NEXT: csrr a2, vlenb +; SINK-NEXT: srli a3, a2, 2 +; SINK-NEXT: li a1, 1024 +; SINK-NEXT: bgeu a1, a3, .LBB4_2 ; SINK-NEXT: # %bb.1: -; SINK-NEXT: li a2, 0 +; SINK-NEXT: li a1, 0 ; SINK-NEXT: j .LBB4_5 ; SINK-NEXT: .LBB4_2: # %vector.ph -; SINK-NEXT: addi a2, a3, -1 -; SINK-NEXT: andi a4, a2, 1024 -; SINK-NEXT: xori a2, a4, 1024 +; SINK-NEXT: addi a1, a3, -1 +; SINK-NEXT: andi a4, a1, 1024 +; SINK-NEXT: xori a1, a4, 1024 ; SINK-NEXT: mv a5, a0 -; SINK-NEXT: mv a6, a2 +; SINK-NEXT: mv a6, a1 ; SINK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; SINK-NEXT: .LBB4_3: # %vector.body ; SINK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -468,12 +468,12 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; SINK-NEXT: sub a6, a6, a3 ; SINK-NEXT: vfadd.vf v8, v8, fa0 ; SINK-NEXT: vs1r.v v8, (a5) -; SINK-NEXT: add a5, a5, a1 +; SINK-NEXT: add a5, a5, a2 ; SINK-NEXT: bnez a6, .LBB4_3 ; SINK-NEXT: # %bb.4: # 
%middle.block ; SINK-NEXT: beqz a4, .LBB4_7 ; SINK-NEXT: .LBB4_5: # %for.body.preheader -; SINK-NEXT: slli a1, a2, 2 +; SINK-NEXT: slli a1, a1, 2 ; SINK-NEXT: lui a2, 1 ; SINK-NEXT: add a1, a0, a1 ; SINK-NEXT: add a0, a0, a2 @@ -489,19 +489,19 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; ; DEFAULT-LABEL: sink_splat_fadd_scalable: ; DEFAULT: # %bb.0: # %entry -; DEFAULT-NEXT: csrr a1, vlenb -; DEFAULT-NEXT: srli a3, a1, 2 -; DEFAULT-NEXT: li a2, 1024 -; DEFAULT-NEXT: bgeu a2, a3, .LBB4_2 +; DEFAULT-NEXT: csrr a2, vlenb +; DEFAULT-NEXT: srli a3, a2, 2 +; DEFAULT-NEXT: li a1, 1024 +; DEFAULT-NEXT: bgeu a1, a3, .LBB4_2 ; DEFAULT-NEXT: # %bb.1: -; DEFAULT-NEXT: li a2, 0 +; DEFAULT-NEXT: li a1, 0 ; DEFAULT-NEXT: j .LBB4_5 ; DEFAULT-NEXT: .LBB4_2: # %vector.ph -; DEFAULT-NEXT: addi a2, a3, -1 -; DEFAULT-NEXT: andi a4, a2, 1024 -; DEFAULT-NEXT: xori a2, a4, 1024 +; DEFAULT-NEXT: addi a1, a3, -1 +; DEFAULT-NEXT: andi a4, a1, 1024 +; DEFAULT-NEXT: xori a1, a4, 1024 ; DEFAULT-NEXT: mv a5, a0 -; DEFAULT-NEXT: mv a6, a2 +; DEFAULT-NEXT: mv a6, a1 ; DEFAULT-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; DEFAULT-NEXT: .LBB4_3: # %vector.body ; DEFAULT-NEXT: # =>This Inner Loop Header: Depth=1 @@ -509,12 +509,12 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; DEFAULT-NEXT: sub a6, a6, a3 ; DEFAULT-NEXT: vfadd.vf v8, v8, fa0 ; DEFAULT-NEXT: vs1r.v v8, (a5) -; DEFAULT-NEXT: add a5, a5, a1 +; DEFAULT-NEXT: add a5, a5, a2 ; DEFAULT-NEXT: bnez a6, .LBB4_3 ; DEFAULT-NEXT: # %bb.4: # %middle.block ; DEFAULT-NEXT: beqz a4, .LBB4_7 ; DEFAULT-NEXT: .LBB4_5: # %for.body.preheader -; DEFAULT-NEXT: slli a1, a2, 2 +; DEFAULT-NEXT: slli a1, a1, 2 ; DEFAULT-NEXT: lui a2, 1 ; DEFAULT-NEXT: add a1, a0, a1 ; DEFAULT-NEXT: add a0, a0, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll index 8c63c2d4be8c1..ec8580e0b6f12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll @@ -497,12 +497,12 @@ declare @llvm.ceil.nxv1f64() define @ceil_nxv1f64_to_si8( %x) { ; RV32-LABEL: ceil_nxv1f64_to_si8: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI16_0) -; RV32-NEXT: fld fa5, %lo(.LCPI16_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI16_0) +; RV32-NEXT: fld fa5, %lo(.LCPI16_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -518,12 +518,12 @@ define @ceil_nxv1f64_to_si8( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI16_0) -; RV64-NEXT: fld fa5, %lo(.LCPI16_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI16_0) +; RV64-NEXT: fld fa5, %lo(.LCPI16_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -544,12 +544,12 @@ define @ceil_nxv1f64_to_si8( %x) { define @ceil_nxv1f64_to_ui8( %x) { ; RV32-LABEL: ceil_nxv1f64_to_ui8: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI17_0) -; RV32-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI17_0) +; RV32-NEXT: fld fa5, 
%lo(.LCPI17_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -565,12 +565,12 @@ define @ceil_nxv1f64_to_ui8( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI17_0) -; RV64-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI17_0) +; RV64-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -591,12 +591,12 @@ define @ceil_nxv1f64_to_ui8( %x) { define @ceil_nxv1f64_to_si16( %x) { ; RV32-LABEL: ceil_nxv1f64_to_si16: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI18_0) -; RV32-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI18_0) +; RV32-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -610,12 +610,12 @@ define @ceil_nxv1f64_to_si16( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI18_0) -; RV64-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI18_0) +; RV64-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -634,12 +634,12 @@ define @ceil_nxv1f64_to_si16( %x) { define @ceil_nxv1f64_to_ui16( %x) { ; RV32-LABEL: ceil_nxv1f64_to_ui16: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI19_0) -; RV32-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI19_0) +; RV32-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -653,12 +653,12 @@ define @ceil_nxv1f64_to_ui16( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI19_0) -; RV64-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI19_0) +; RV64-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -771,12 +771,12 @@ declare @llvm.ceil.nxv4f64() define @ceil_nxv4f64_to_si8( %x) { ; RV32-LABEL: ceil_nxv4f64_to_si8: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI24_0) -; RV32-NEXT: fld fa5, %lo(.LCPI24_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI24_0) +; RV32-NEXT: fld fa5, %lo(.LCPI24_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v12, fa5 ; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -792,12 +792,12 @@ define @ceil_nxv4f64_to_si8( %x) 
{ ; ; RV64-LABEL: ceil_nxv4f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI24_0) -; RV64-NEXT: fld fa5, %lo(.LCPI24_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI24_0) +; RV64-NEXT: fld fa5, %lo(.LCPI24_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -818,12 +818,12 @@ define @ceil_nxv4f64_to_si8( %x) { define @ceil_nxv4f64_to_ui8( %x) { ; RV32-LABEL: ceil_nxv4f64_to_ui8: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI25_0) -; RV32-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI25_0) +; RV32-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v12, fa5 ; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -839,12 +839,12 @@ define @ceil_nxv4f64_to_ui8( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI25_0) -; RV64-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI25_0) +; RV64-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -865,12 +865,12 @@ define @ceil_nxv4f64_to_ui8( %x) { define @ceil_nxv4f64_to_si16( %x) { ; RV32-LABEL: ceil_nxv4f64_to_si16: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI26_0) -; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI26_0) +; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v12, fa5 ; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -884,12 +884,12 @@ define @ceil_nxv4f64_to_si16( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI26_0) -; RV64-NEXT: fld fa5, %lo(.LCPI26_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI26_0) +; RV64-NEXT: fld fa5, %lo(.LCPI26_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -908,12 +908,12 @@ define @ceil_nxv4f64_to_si16( %x) { define @ceil_nxv4f64_to_ui16( %x) { ; RV32-LABEL: ceil_nxv4f64_to_ui16: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI27_0) -; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: lui a0, %hi(.LCPI27_0) +; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a0) ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v12, fa5 ; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -927,12 +927,12 @@ define @ceil_nxv4f64_to_ui16( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI27_0) -; RV64-NEXT: fld fa5, %lo(.LCPI27_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: 
vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: lui a0, %hi(.LCPI27_0) +; RV64-NEXT: fld fa5, %lo(.LCPI27_0)(a0) ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll index a35cf14203f78..51c70a32ccac8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll @@ -136,12 +136,12 @@ define <32 x i8> @test_expandload_v32i8(ptr %base, <32 x i1> %mask, <32 x i8> %p ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: viota.m v10, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: viota.m v12, v0 -; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: vrgather.vv v8, v12, v10, v0.t ; CHECK-NEXT: ret %res = call <32 x i8> @llvm.masked.expandload.v32i8(ptr align 1 %base, <32 x i1> %mask, <32 x i8> %passthru) ret <32 x i8> %res @@ -163,12 +163,12 @@ define <64 x i8> @test_expandload_v64i8(ptr %base, <64 x i1> %mask, <64 x i8> %p ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: viota.m v12, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: viota.m v16, v0 -; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: vrgather.vv v8, v16, v12, v0.t ; CHECK-NEXT: ret %res = call <64 x i8> @llvm.masked.expandload.v64i8(ptr align 1 %base, <64 x i1> %mask, <64 x i8> %passthru) ret <64 x i8> %res @@ -190,12 +190,12 @@ define <128 x i8> @test_expandload_v128i8(ptr %base, <128 x i1> %mask, <128 x i8 ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: viota.m v16, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: viota.m v24, v0 -; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: vrgather.vv v8, v24, v16, v0.t ; CHECK-NEXT: ret %res = call <128 x i8> @llvm.masked.expandload.v128i8(ptr align 1 %base, <128 x i1> %mask, <128 x i8> %passthru) ret <128 x i8> %res @@ -218,106 +218,71 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a2, vlenb -; CHECK-RV32-NEXT: slli a2, a2, 5 +; CHECK-RV32-NEXT: slli a2, a2, 4 ; CHECK-RV32-NEXT: sub sp, sp, a2 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-RV32-NEXT: csrr a2, vlenb -; CHECK-RV32-NEXT: li a3, 24 -; CHECK-RV32-NEXT: mul a2, a2, a3 +; CHECK-RV32-NEXT: slli a2, a2, 3 ; CHECK-RV32-NEXT: add a2, sp, a2 ; CHECK-RV32-NEXT: addi a2, a2, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv1r.v v7, v8 ; CHECK-RV32-NEXT: 
li a2, 128 -; CHECK-RV32-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV32-NEXT: vslidedown.vi v6, v0, 1 ; CHECK-RV32-NEXT: li a3, 32 ; CHECK-RV32-NEXT: vmv.x.s a4, v0 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: addi a5, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vcpop.m a5, v0 +; CHECK-RV32-NEXT: vsetvli zero, a5, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v24, (a0) +; CHECK-RV32-NEXT: csrr a5, vlenb +; CHECK-RV32-NEXT: slli a5, a5, 3 +; CHECK-RV32-NEXT: add a5, sp, a5 +; CHECK-RV32-NEXT: addi a5, a5, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a5, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a5) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: csrr a5, vlenb +; CHECK-RV32-NEXT: slli a5, a5, 3 +; CHECK-RV32-NEXT: add a5, sp, a5 +; CHECK-RV32-NEXT: addi a5, a5, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vle8.v v16, (a1) -; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 3 -; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsrl.vx v10, v9, a3 +; CHECK-RV32-NEXT: vsrl.vx v10, v6, a3 ; CHECK-RV32-NEXT: vsrl.vx v11, v0, a3 -; CHECK-RV32-NEXT: vmv.x.s a1, v9 -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV32-NEXT: vcpop.m a3, v0 -; CHECK-RV32-NEXT: cpop a4, a4 -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.x.s a5, v10 -; CHECK-RV32-NEXT: vmv.x.s a6, v11 -; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-RV32-NEXT: vle8.v v8, (a0) -; CHECK-RV32-NEXT: csrr a3, vlenb -; CHECK-RV32-NEXT: slli a3, a3, 4 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: addi a3, a3, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv.x.s a1, v6 +; CHECK-RV32-NEXT: cpop a3, a4 +; CHECK-RV32-NEXT: vmv.x.s a4, v10 +; CHECK-RV32-NEXT: vmv.x.s a5, v11 ; CHECK-RV32-NEXT: cpop a1, a1 -; CHECK-RV32-NEXT: cpop a3, a6 ; CHECK-RV32-NEXT: cpop a5, a5 -; CHECK-RV32-NEXT: add a3, a4, a3 -; CHECK-RV32-NEXT: add a1, a1, a5 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: add a3, a3, a5 +; CHECK-RV32-NEXT: add a1, a1, a4 ; CHECK-RV32-NEXT: add a1, a3, a1 ; CHECK-RV32-NEXT: add a0, a0, a1 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-RV32-NEXT: vcpop.m a1, v7 ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-RV32-NEXT: vle8.v v8, (a0) -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vle8.v v24, (a0) ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v24, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; 
CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v8, v7 ; CHECK-RV32-NEXT: vmv1r.v v0, v7 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -329,38 +294,50 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a2, vlenb -; CHECK-RV64-NEXT: slli a2, a2, 5 -; CHECK-RV64-NEXT: sub sp, sp, a2 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-RV64-NEXT: csrr a2, vlenb ; CHECK-RV64-NEXT: li a3, 24 ; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: sub sp, sp, a2 +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: slli a2, a2, 4 ; CHECK-RV64-NEXT: add a2, sp, a2 ; CHECK-RV64-NEXT: addi a2, a2, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vmv1r.v v7, v8 ; CHECK-RV64-NEXT: li a2, 128 -; CHECK-RV64-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV64-NEXT: vslidedown.vi v6, v0, 1 ; CHECK-RV64-NEXT: vmv.x.s a3, v0 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v16, (a1) -; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 3 -; CHECK-RV64-NEXT: add a1, sp, a1 -; CHECK-RV64-NEXT: addi a1, a1, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.x.s a1, v9 -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a4, a4, 3 +; CHECK-RV64-NEXT: add a4, sp, a4 +; CHECK-RV64-NEXT: addi a4, a4, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vcpop.m a4, v0 ; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma -; CHECK-RV64-NEXT: vle8.v v8, (a0) +; CHECK-RV64-NEXT: vle8.v v16, (a0) +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a4, a4, 3 +; 
CHECK-RV64-NEXT: add a4, sp, a4 +; CHECK-RV64-NEXT: addi a4, a4, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a4, a4, 4 +; CHECK-RV64-NEXT: add a4, sp, a4 +; CHECK-RV64-NEXT: addi a4, a4, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t ; CHECK-RV64-NEXT: csrr a4, vlenb ; CHECK-RV64-NEXT: slli a4, a4, 4 ; CHECK-RV64-NEXT: add a4, sp, a4 ; CHECK-RV64-NEXT: addi a4, a4, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vle8.v v16, (a1) +; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v6 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-RV64-NEXT: vcpop.m a4, v7 ; CHECK-RV64-NEXT: cpop a3, a3 @@ -372,53 +349,29 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8 ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: viota.m v24, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v16, v8, v24, v0.t ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -608,13 +561,13 @@ define <32 x i16> @test_expandload_v32i16(ptr %base, <32 x i1> %mask, <32 x i16> ; CHECK-LABEL: 
test_expandload_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: viota.m v12, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: viota.m v16, v0 -; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: vrgather.vv v8, v16, v12, v0.t ; CHECK-NEXT: ret %res = call <32 x i16> @llvm.masked.expandload.v32i16(ptr align 2 %base, <32 x i1> %mask, <32 x i16> %passthru) ret <32 x i16> %res @@ -635,13 +588,13 @@ define <64 x i16> @test_expandload_v64i16(ptr %base, <64 x i1> %mask, <64 x i16> ; CHECK-LABEL: test_expandload_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: viota.m v16, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-NEXT: viota.m v24, v0 -; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: vrgather.vv v8, v24, v16, v0.t ; CHECK-NEXT: ret %res = call <64 x i16> @llvm.masked.expandload.v64i16(ptr align 2 %base, <64 x i1> %mask, <64 x i16> %passthru) ret <64 x i16> %res @@ -664,76 +617,66 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 -; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: li a2, 24 ; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: sub sp, sp, a1 +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: vcpop.m a2, v0 +; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v24, (a0) +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: slli a2, a2, 3 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 8 +; CHECK-RV32-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-RV32-NEXT: li a2, 32 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-RV32-NEXT: vmv.x.s a3, v0 ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV32-NEXT: vcpop.m a4, v0 +; CHECK-RV32-NEXT: vcpop.m a4, v24 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsrl.vx v25, v0, a2 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV32-NEXT: vcpop.m a2, v7 -; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma -; CHECK-RV32-NEXT: vle16.v v16, (a0) 
-; CHECK-RV32-NEXT: csrr a5, vlenb -; CHECK-RV32-NEXT: slli a5, a5, 4 -; CHECK-RV32-NEXT: add a5, sp, a5 -; CHECK-RV32-NEXT: addi a5, a5, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.x.s a4, v25 -; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: vsrl.vx v8, v0, a2 +; CHECK-RV32-NEXT: cpop a2, a3 +; CHECK-RV32-NEXT: vmv.x.s a3, v8 ; CHECK-RV32-NEXT: cpop a3, a3 -; CHECK-RV32-NEXT: add a3, a3, a4 -; CHECK-RV32-NEXT: slli a3, a3, 1 -; CHECK-RV32-NEXT: add a0, a0, a3 -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-RV32-NEXT: vle16.v v16, (a0) -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v16, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: add a2, a2, a3 +; CHECK-RV32-NEXT: slli a2, a2, 1 +; CHECK-RV32-NEXT: add a0, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a4, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v8, (a0) ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v8, v7 -; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV32-NEXT: viota.m v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v0, v24 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t -; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -749,50 +692,58 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV64-NEXT: sub sp, sp, a1 ; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 3 +; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 64 -; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 8 -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: csrr a2, vlenb +; 
CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vcpop.m a2, v0 -; CHECK-RV64-NEXT: vcpop.m a3, v7 ; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-RV64-NEXT: vle16.v v24, (a0) -; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: slli a4, a4, 4 -; CHECK-RV64-NEXT: add a4, sp, a4 -; CHECK-RV64-NEXT: addi a4, a4, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: li a4, 24 +; CHECK-RV64-NEXT: mul a3, a3, a4 +; CHECK-RV64-NEXT: add a3, sp, a3 +; CHECK-RV64-NEXT: addi a3, a3, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: addi a3, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vcpop.m a3, v0 ; CHECK-RV64-NEXT: slli a2, a2, 1 ; CHECK-RV64-NEXT: add a0, a0, a2 ; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma -; CHECK-RV64-NEXT: vle16.v v24, (a0) +; CHECK-RV64-NEXT: vle16.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a2, 24 ; CHECK-RV64-NEXT: mul a0, a0, a2 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 +; CHECK-RV64-NEXT: viota.m v16, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v16, v7 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: li a1, 24 ; CHECK-RV64-NEXT: mul a0, a0, a1 @@ -803,11 +754,6 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 4 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: addi a0, sp, 16 @@ -990,13 +936,13 @@ define <32 x i32> @test_expandload_v32i32(ptr %base, <32 x i1> %mask, <32 x i32> ; CHECK-LABEL: test_expandload_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: 
viota.m v16, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: viota.m v24, v0 -; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: vrgather.vv v8, v24, v16, v0.t ; CHECK-NEXT: ret %res = call <32 x i32> @llvm.masked.expandload.v32i32(ptr align 4 %base, <32 x i1> %mask, <32 x i32> %passthru) ret <32 x i32> %res @@ -1023,50 +969,58 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV32-NEXT: sub sp, sp, a1 ; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 3 +; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: li a1, 32 -; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 4 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: li a3, 24 +; CHECK-RV32-NEXT: mul a2, a2, a3 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vcpop.m a2, v0 -; CHECK-RV32-NEXT: vcpop.m a3, v7 ; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-RV32-NEXT: vle32.v v24, (a0) -; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: slli a4, a4, 4 -; CHECK-RV32-NEXT: add a4, sp, a4 -; CHECK-RV32-NEXT: addi a4, a4, 16 -; CHECK-RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: csrr a3, vlenb +; CHECK-RV32-NEXT: li a4, 24 +; CHECK-RV32-NEXT: mul a3, a3, a4 +; CHECK-RV32-NEXT: add a3, sp, a3 +; CHECK-RV32-NEXT: addi a3, a3, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: addi a3, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vcpop.m a3, v0 ; CHECK-RV32-NEXT: slli a2, a2, 2 ; CHECK-RV32-NEXT: add a0, a0, a2 ; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-RV32-NEXT: vle32.v v24, (a0) +; CHECK-RV32-NEXT: vle32.v v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: li a2, 24 ; CHECK-RV32-NEXT: mul a0, a0, a2 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v24, v0 +; CHECK-RV32-NEXT: viota.m v16, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v16, v7 +; 
CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: li a1, 24 ; CHECK-RV32-NEXT: mul a0, a0, a1 @@ -1077,11 +1031,6 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 4 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV32-NEXT: addi a0, sp, 16 @@ -1108,55 +1057,68 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 32 +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vcpop.m a2, v0 +; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV64-NEXT: vle32.v v24, (a0) +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: addi a2, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 4 +; CHECK-RV64-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-RV64-NEXT: vmv.x.s a2, v0 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV64-NEXT: vcpop.m a3, v0 -; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-RV64-NEXT: vle32.v v24, (a0) -; CHECK-RV64-NEXT: csrr a3, vlenb -; CHECK-RV64-NEXT: li a4, 24 -; CHECK-RV64-NEXT: mul a3, a3, a4 -; CHECK-RV64-NEXT: add a3, sp, a3 -; CHECK-RV64-NEXT: addi a3, a3, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV64-NEXT: vcpop.m a3, v7 +; CHECK-RV64-NEXT: vcpop.m a3, v24 ; CHECK-RV64-NEXT: cpopw a2, a2 ; CHECK-RV64-NEXT: slli a2, a2, 2 ; CHECK-RV64-NEXT: add a0, a0, a2 ; CHECK-RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-RV64-NEXT: vle32.v v16, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a0, a0, a2 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 +; CHECK-RV64-NEXT: viota.m v16, v24 ; CHECK-RV64-NEXT: csrr a0, vlenb -; 
CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 3 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v8, v7 -; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v0, v24 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -1329,33 +1291,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi sp, sp, -16 ; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: li a2, 24 +; CHECK-RV32-NEXT: mul a1, a1, a2 ; CHECK-RV32-NEXT: sub sp, sp, a1 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb ; CHECK-RV32-NEXT: slli a1, a1, 4 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV32-NEXT: vcpop.m a1, v0 -; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV32-NEXT: vle64.v v24, (a0) -; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: li a2, 24 -; CHECK-RV32-NEXT: mul a1, a1, a2 -; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: addi a1, a1, 16 -; CHECK-RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: addi a1, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-RV32-NEXT: vmv.x.s a1, v0 ; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-RV32-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-RV32-NEXT: zext.h a1, a1 ; CHECK-RV32-NEXT: cpop a1, a1 ; CHECK-RV32-NEXT: slli a1, a1, 3 ; CHECK-RV32-NEXT: add a0, a0, a1 ; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-RV32-NEXT: vcpop.m a1, v7 +; CHECK-RV32-NEXT: vcpop.m a1, v0 ; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV32-NEXT: vle64.v 
v16, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb @@ -1364,18 +1327,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-RV32-NEXT: viota.m v24, v0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v8, v7 -; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: viota.m v8, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 @@ -1390,7 +1342,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add sp, sp, a0 ; CHECK-RV32-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV32-NEXT: addi sp, sp, 16 @@ -1402,33 +1355,34 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi sp, sp, -16 ; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a1, a1, a2 ; CHECK-RV64-NEXT: sub sp, sp, a1 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb ; CHECK-RV64-NEXT: slli a1, a1, 4 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV64-NEXT: vcpop.m a1, v0 -; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV64-NEXT: vle64.v v24, (a0) -; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: li a2, 24 -; CHECK-RV64-NEXT: mul a1, a1, a2 -; CHECK-RV64-NEXT: add a1, sp, a1 -; CHECK-RV64-NEXT: addi a1, a1, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: addi a1, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-RV64-NEXT: vmv.x.s a1, v0 ; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-RV64-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-RV64-NEXT: zext.h a1, a1 ; CHECK-RV64-NEXT: cpopw a1, a1 ; CHECK-RV64-NEXT: slli a1, a1, 3 ; CHECK-RV64-NEXT: add a0, a0, a1 ; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-RV64-NEXT: vcpop.m a1, v7 +; CHECK-RV64-NEXT: vcpop.m a1, v0 ; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-RV64-NEXT: vle64.v v16, 
(a0) ; CHECK-RV64-NEXT: csrr a0, vlenb @@ -1437,18 +1391,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-RV64-NEXT: viota.m v24, v0 -; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 -; CHECK-RV64-NEXT: add a0, sp, a0 -; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v8, v7 -; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: viota.m v8, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 @@ -1463,7 +1406,8 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add sp, sp, a0 ; CHECK-RV64-NEXT: .cfi_def_cfa sp, 16 ; CHECK-RV64-NEXT: addi sp, sp, 16 @@ -1491,13 +1435,12 @@ define <512 x i8> @test_expandload_v512i8(ptr %base, <512 x i1> %mask, <512 x i8 ; CHECK-LABEL: test_expandload_v512i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 512 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: viota.m v16, v0 ; CHECK-NEXT: vcpop.m a2, v0 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: viota.m v16, v0 -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu ; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t ; CHECK-NEXT: ret %res = call <512 x i8> @llvm.masked.expandload.v512i8(ptr align 1 %base, <512 x i1> %mask, <512 x i8> %passthru) @@ -1630,12 +1573,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 28 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_30: # %else110 -; CHECK-RV32-NEXT: slli a2, a3, 2 ; CHECK-RV32-NEXT: li a1, 32 +; CHECK-RV32-NEXT: slli a2, a3, 2 ; CHECK-RV32-NEXT: bgez a2, .LBB61_32 ; CHECK-RV32-NEXT: # %bb.31: # %cond.load113 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -1643,13 +1586,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a2 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 29 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_32: # %else114 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v0, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_34 ; CHECK-RV32-NEXT: # %bb.33: # %cond.load117 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -1657,8 +1600,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; 
CHECK-RV32-NEXT: vmv.s.x v9, a2 ; CHECK-RV32-NEXT: vsetivli zero, 31, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 30 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_34: # %else118 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -1793,13 +1736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 61 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_66: # %else242 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_68 ; CHECK-RV32-NEXT: # %bb.67: # %cond.load245 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -1809,8 +1752,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 62 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_68: # %else246 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -1945,13 +1888,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 93 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_100: # %else370 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_102 ; CHECK-RV32-NEXT: # %bb.101: # %cond.load373 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -1961,8 +1904,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 94 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_102: # %else374 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2097,13 +2040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 125 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_134: # %else498 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_136 ; CHECK-RV32-NEXT: # %bb.135: # %cond.load501 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -2113,8 +2056,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 126 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; 
CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_136: # %else502 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2249,13 +2192,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 157 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_168: # %else626 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_170 ; CHECK-RV32-NEXT: # %bb.169: # %cond.load629 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -2265,8 +2208,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 158 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_170: # %else630 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2401,13 +2344,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 189 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_202: # %else754 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 3 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_204 ; CHECK-RV32-NEXT: # %bb.203: # %cond.load757 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -2417,8 +2360,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 190 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_204: # %else758 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2553,13 +2496,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 221 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_236: # %else882 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_238 ; CHECK-RV32-NEXT: # %bb.237: # %cond.load885 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -2569,8 +2512,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 222 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; 
CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_238: # %else886 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2705,13 +2648,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 253 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_270: # %else1010 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 4 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_272 ; CHECK-RV32-NEXT: # %bb.271: # %cond.load1013 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -2721,8 +2664,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 254 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: .LBB61_272: # %else1014 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2859,9 +2802,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_304: # %else1138 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_306 ; CHECK-RV32-NEXT: # %bb.305: # %cond.load1141 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -3006,9 +2949,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_338: # %else1266 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 5 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_340 ; CHECK-RV32-NEXT: # %bb.339: # %cond.load1269 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -3153,9 +3096,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_372: # %else1394 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_374 ; CHECK-RV32-NEXT: # %bb.373: # %cond.load1397 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -3300,9 +3243,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_406: # %else1522 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 6 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_408 ; CHECK-RV32-NEXT: # %bb.407: # %cond.load1525 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -3447,9 +3390,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 ; CHECK-RV32-NEXT: 
addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_440: # %else1650 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_442 ; CHECK-RV32-NEXT: # %bb.441: # %cond.load1653 ; CHECK-RV32-NEXT: lbu a2, 0(a0) @@ -3594,9 +3537,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_474: # %else1778 -; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 7 +; CHECK-RV32-NEXT: slli a3, a2, 1 ; CHECK-RV32-NEXT: bgez a3, .LBB61_476 ; CHECK-RV32-NEXT: # %bb.475: # %cond.load1781 ; CHECK-RV32-NEXT: lbu a3, 0(a0) @@ -3741,10 +3684,10 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 ; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: .LBB61_508: # %else1906 -; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 -; CHECK-RV32-NEXT: bgez a2, .LBB61_510 +; CHECK-RV32-NEXT: slli a1, a3, 1 +; CHECK-RV32-NEXT: bgez a1, .LBB61_510 ; CHECK-RV32-NEXT: # %bb.509: # %cond.load1909 ; CHECK-RV32-NEXT: lbu a1, 0(a0) ; CHECK-RV32-NEXT: vmv.s.x v24, a1 @@ -3892,8 +3835,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vsetvli zero, zero, e8, mf8, tu, ma ; CHECK-RV32-NEXT: vmv.s.x v8, a1 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 2 ; CHECK-RV32-NEXT: bnez a1, .LBB61_545 @@ -3904,8 +3847,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 1 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 4 ; CHECK-RV32-NEXT: bnez a1, .LBB61_546 @@ -3916,8 +3859,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 2 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 8 ; CHECK-RV32-NEXT: bnez a1, .LBB61_547 @@ -3928,8 +3871,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 3 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 16 ; CHECK-RV32-NEXT: bnez a1, .LBB61_548 @@ -3940,8 +3883,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 32 ; 
CHECK-RV32-NEXT: bnez a1, .LBB61_549 @@ -3952,8 +3895,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 5 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 64 ; CHECK-RV32-NEXT: bnez a1, .LBB61_550 @@ -3964,8 +3907,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 6 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 128 ; CHECK-RV32-NEXT: bnez a1, .LBB61_551 @@ -3976,8 +3919,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 7 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 256 ; CHECK-RV32-NEXT: bnez a1, .LBB61_552 @@ -3988,8 +3931,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 8 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 512 ; CHECK-RV32-NEXT: bnez a1, .LBB61_553 @@ -4000,8 +3943,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 9 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a1, a3, 1024 ; CHECK-RV32-NEXT: bnez a1, .LBB61_554 @@ -4012,8 +3955,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 10 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 20 ; CHECK-RV32-NEXT: bltz a1, .LBB61_555 @@ -4024,8 +3967,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 11 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 19 ; CHECK-RV32-NEXT: bltz a1, .LBB61_556 @@ -4036,8 +3979,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 12 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 18 ; CHECK-RV32-NEXT: bltz a1, .LBB61_557 @@ -4048,8 +3991,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> 
%mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 13 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 17 ; CHECK-RV32-NEXT: bltz a1, .LBB61_558 @@ -4060,8 +4003,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 14 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 16 ; CHECK-RV32-NEXT: bltz a1, .LBB61_559 @@ -4072,8 +4015,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 15 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 15 ; CHECK-RV32-NEXT: bltz a1, .LBB61_560 @@ -4084,8 +4027,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 16 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 14 ; CHECK-RV32-NEXT: bltz a1, .LBB61_561 @@ -4096,8 +4039,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 17 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 13 ; CHECK-RV32-NEXT: bltz a1, .LBB61_562 @@ -4108,8 +4051,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 18 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 12 ; CHECK-RV32-NEXT: bltz a1, .LBB61_563 @@ -4120,8 +4063,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 19 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 11 ; CHECK-RV32-NEXT: bltz a1, .LBB61_564 @@ -4132,8 +4075,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 20 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 10 ; CHECK-RV32-NEXT: bltz a1, .LBB61_565 @@ -4144,8 +4087,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 21 -; 
CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 9 ; CHECK-RV32-NEXT: bltz a1, .LBB61_566 @@ -4156,8 +4099,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 22 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 8 ; CHECK-RV32-NEXT: bltz a1, .LBB61_567 @@ -4168,8 +4111,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 23 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 7 ; CHECK-RV32-NEXT: bltz a1, .LBB61_568 @@ -4180,8 +4123,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 24 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 6 ; CHECK-RV32-NEXT: bltz a1, .LBB61_569 @@ -4192,8 +4135,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 25 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 5 ; CHECK-RV32-NEXT: bltz a1, .LBB61_570 @@ -4204,8 +4147,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 26 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 4 ; CHECK-RV32-NEXT: bltz a1, .LBB61_571 @@ -4216,8 +4159,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a1 ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 27 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a1, a3, 3 ; CHECK-RV32-NEXT: bgez a1, .LBB61_1025 @@ -4231,8 +4174,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a3, 32 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 31 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1 ; CHECK-RV32-NEXT: bnez a3, .LBB61_573 @@ -4246,8 +4189,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 32 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; 
CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 2 ; CHECK-RV32-NEXT: bnez a3, .LBB61_574 @@ -4261,8 +4204,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 33 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 4 ; CHECK-RV32-NEXT: bnez a3, .LBB61_575 @@ -4276,8 +4219,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 34 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 8 ; CHECK-RV32-NEXT: bnez a3, .LBB61_576 @@ -4291,8 +4234,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 35 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 16 ; CHECK-RV32-NEXT: bnez a3, .LBB61_577 @@ -4306,8 +4249,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 36 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 32 ; CHECK-RV32-NEXT: bnez a3, .LBB61_578 @@ -4321,8 +4264,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 37 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 64 ; CHECK-RV32-NEXT: bnez a3, .LBB61_579 @@ -4336,8 +4279,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 38 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 128 ; CHECK-RV32-NEXT: bnez a3, .LBB61_580 @@ -4351,8 +4294,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 39 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 256 ; CHECK-RV32-NEXT: bnez a3, .LBB61_581 @@ -4366,8 +4309,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 40 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; 
CHECK-RV32-NEXT: andi a3, a2, 512 ; CHECK-RV32-NEXT: bnez a3, .LBB61_582 @@ -4381,8 +4324,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 41 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1024 ; CHECK-RV32-NEXT: bnez a3, .LBB61_583 @@ -4396,8 +4339,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 42 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 20 ; CHECK-RV32-NEXT: bltz a3, .LBB61_584 @@ -4411,8 +4354,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 43 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 19 ; CHECK-RV32-NEXT: bltz a3, .LBB61_585 @@ -4426,8 +4369,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 44 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 18 ; CHECK-RV32-NEXT: bltz a3, .LBB61_586 @@ -4441,8 +4384,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 45 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 17 ; CHECK-RV32-NEXT: bltz a3, .LBB61_587 @@ -4456,8 +4399,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 46 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 16 ; CHECK-RV32-NEXT: bltz a3, .LBB61_588 @@ -4471,8 +4414,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 47 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 15 ; CHECK-RV32-NEXT: bltz a3, .LBB61_589 @@ -4486,8 +4429,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 48 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 
14 ; CHECK-RV32-NEXT: bltz a3, .LBB61_590 @@ -4501,8 +4444,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 49 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 13 ; CHECK-RV32-NEXT: bltz a3, .LBB61_591 @@ -4516,8 +4459,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 50 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 12 ; CHECK-RV32-NEXT: bltz a3, .LBB61_592 @@ -4531,8 +4474,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 51 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 11 ; CHECK-RV32-NEXT: bltz a3, .LBB61_593 @@ -4546,8 +4489,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 52 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 10 ; CHECK-RV32-NEXT: bltz a3, .LBB61_594 @@ -4561,8 +4504,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 53 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 9 ; CHECK-RV32-NEXT: bltz a3, .LBB61_595 @@ -4576,8 +4519,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 54 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 8 ; CHECK-RV32-NEXT: bltz a3, .LBB61_596 @@ -4591,8 +4534,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 55 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 7 ; CHECK-RV32-NEXT: bltz a3, .LBB61_597 @@ -4606,8 +4549,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 56 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 6 ; CHECK-RV32-NEXT: bltz a3, 
.LBB61_598 @@ -4621,8 +4564,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 57 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 5 ; CHECK-RV32-NEXT: bltz a3, .LBB61_599 @@ -4636,8 +4579,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 58 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 4 ; CHECK-RV32-NEXT: bltz a3, .LBB61_600 @@ -4651,8 +4594,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 59 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 3 ; CHECK-RV32-NEXT: bltz a3, .LBB61_601 @@ -4666,8 +4609,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 60 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 2 ; CHECK-RV32-NEXT: bgez a3, .LBB61_1026 @@ -4682,8 +4625,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 63 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v9, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1 ; CHECK-RV32-NEXT: bnez a2, .LBB61_603 @@ -4697,8 +4640,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 64 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 2 ; CHECK-RV32-NEXT: bnez a2, .LBB61_604 @@ -4712,8 +4655,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 65 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 4 ; CHECK-RV32-NEXT: bnez a2, .LBB61_605 @@ -4727,8 +4670,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 66 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 8 ; CHECK-RV32-NEXT: bnez a2, .LBB61_606 @@ -4742,8 +4685,8 @@ 
define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 67 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 16 ; CHECK-RV32-NEXT: bnez a2, .LBB61_607 @@ -4757,8 +4700,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 68 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 32 ; CHECK-RV32-NEXT: bnez a2, .LBB61_608 @@ -4772,8 +4715,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 69 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 64 ; CHECK-RV32-NEXT: bnez a2, .LBB61_609 @@ -4787,8 +4730,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 70 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 128 ; CHECK-RV32-NEXT: bnez a2, .LBB61_610 @@ -4802,8 +4745,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 71 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 256 ; CHECK-RV32-NEXT: bnez a2, .LBB61_611 @@ -4817,8 +4760,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 72 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 512 ; CHECK-RV32-NEXT: bnez a2, .LBB61_612 @@ -4832,8 +4775,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 73 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1024 ; CHECK-RV32-NEXT: bnez a2, .LBB61_613 @@ -4847,8 +4790,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 74 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 20 ; CHECK-RV32-NEXT: bltz a2, .LBB61_614 @@ -4862,8 +4805,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 75 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 19 ; CHECK-RV32-NEXT: bltz a2, .LBB61_615 @@ -4877,8 +4820,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 76 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 18 ; CHECK-RV32-NEXT: bltz a2, .LBB61_616 @@ -4892,8 +4835,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 77 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 17 ; CHECK-RV32-NEXT: bltz a2, .LBB61_617 @@ -4907,8 +4850,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 78 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 16 ; CHECK-RV32-NEXT: bltz a2, .LBB61_618 @@ -4922,8 +4865,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 79 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 15 ; CHECK-RV32-NEXT: bltz a2, .LBB61_619 @@ -4937,8 +4880,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 80 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 14 ; CHECK-RV32-NEXT: bltz a2, .LBB61_620 @@ -4952,8 +4895,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 81 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 13 ; CHECK-RV32-NEXT: bltz a2, .LBB61_621 @@ -4967,8 +4910,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 82 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 12 ; CHECK-RV32-NEXT: bltz a2, .LBB61_622 @@ -4982,8 +4925,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 83 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 11 ; CHECK-RV32-NEXT: bltz a2, .LBB61_623 @@ -4997,8 +4940,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 84 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 10 ; CHECK-RV32-NEXT: bltz a2, .LBB61_624 @@ -5012,8 +4955,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 85 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 9 ; CHECK-RV32-NEXT: bltz a2, .LBB61_625 @@ -5027,8 +4970,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 86 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 8 ; CHECK-RV32-NEXT: bltz a2, .LBB61_626 @@ -5042,8 +4985,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 87 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 7 ; CHECK-RV32-NEXT: bltz a2, .LBB61_627 @@ -5057,8 +5000,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 88 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 6 ; CHECK-RV32-NEXT: bltz a2, .LBB61_628 @@ -5072,8 +5015,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 89 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 5 ; CHECK-RV32-NEXT: bltz a2, .LBB61_629 @@ -5087,8 +5030,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 90 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 4 ; CHECK-RV32-NEXT: bltz a2, .LBB61_630 @@ -5102,8 +5045,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 91 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 3 ; CHECK-RV32-NEXT: bltz a2, .LBB61_631 @@ -5117,8 +5060,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 92 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 2 ; CHECK-RV32-NEXT: bgez a2, .LBB61_1027 @@ -5133,8 +5076,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 95 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1 ; CHECK-RV32-NEXT: bnez a3, .LBB61_633 @@ -5148,8 +5091,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 96 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 2 ; CHECK-RV32-NEXT: bnez a3, .LBB61_634 @@ -5163,8 +5106,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 97 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 4 ; CHECK-RV32-NEXT: bnez a3, .LBB61_635 @@ -5178,8 +5121,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 98 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 8 ; CHECK-RV32-NEXT: bnez a3, .LBB61_636 @@ -5193,8 +5136,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 99 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 16 ; CHECK-RV32-NEXT: bnez a3, .LBB61_637 @@ -5208,8 +5151,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 100 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 32 ; CHECK-RV32-NEXT: bnez a3, .LBB61_638 @@ -5223,8 +5166,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 101 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 64 ; CHECK-RV32-NEXT: bnez a3, .LBB61_639 @@ -5238,8 +5181,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 102 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 128 ; CHECK-RV32-NEXT: bnez a3, .LBB61_640 @@ -5253,8 +5196,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 103 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 256 ; CHECK-RV32-NEXT: bnez a3, .LBB61_641 @@ -5268,8 +5211,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 104 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 512 ; CHECK-RV32-NEXT: bnez a3, .LBB61_642 @@ -5283,8 +5226,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 105 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1024 ; CHECK-RV32-NEXT: bnez a3, .LBB61_643 @@ -5298,8 +5241,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 106 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 20 ; CHECK-RV32-NEXT: bltz a3, .LBB61_644 @@ -5313,8 +5256,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 107 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 19 ; CHECK-RV32-NEXT: bltz a3, .LBB61_645 @@ -5328,8 +5271,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 108 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 18 ; CHECK-RV32-NEXT: bltz a3, .LBB61_646 @@ -5343,8 +5286,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 109 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 17 ; CHECK-RV32-NEXT: bltz a3, .LBB61_647 @@ -5358,8 +5301,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 110 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 16 ; CHECK-RV32-NEXT: bltz a3, .LBB61_648 @@ -5373,8 +5316,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 111 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 15 ; CHECK-RV32-NEXT: bltz a3, .LBB61_649 @@ -5388,8 +5331,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 112 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 14 ; CHECK-RV32-NEXT: bltz a3, .LBB61_650 @@ -5403,8 +5346,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 113 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 13 ; CHECK-RV32-NEXT: bltz a3, .LBB61_651 @@ -5418,8 +5361,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 114 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 12 ; CHECK-RV32-NEXT: bltz a3, .LBB61_652 @@ -5433,8 +5376,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 115 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 11 ; CHECK-RV32-NEXT: bltz a3, .LBB61_653 @@ -5448,8 +5391,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 116 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 10 ; CHECK-RV32-NEXT: bltz a3, .LBB61_654 @@ -5463,8 +5406,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 117 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 9 ; CHECK-RV32-NEXT: bltz a3, .LBB61_655 @@ -5478,8 +5421,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 118 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 8 ; CHECK-RV32-NEXT: bltz a3, .LBB61_656 @@ -5493,8 +5436,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 119 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 7 ; CHECK-RV32-NEXT: bltz a3, .LBB61_657 @@ -5508,8 +5451,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 120 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 6 ; CHECK-RV32-NEXT: bltz a3, .LBB61_658 @@ -5523,8 +5466,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 121 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 5 ; CHECK-RV32-NEXT: bltz a3, .LBB61_659 @@ -5538,8 +5481,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 122 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 4 ; CHECK-RV32-NEXT: bltz a3, .LBB61_660 @@ -5553,8 +5496,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 123 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 3 ; CHECK-RV32-NEXT: bltz a3, .LBB61_661 @@ -5568,8 +5511,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 124 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 2 ; CHECK-RV32-NEXT: bgez a3, .LBB61_1028 @@ -5584,8 +5527,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 127 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v10, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1 ; CHECK-RV32-NEXT: bnez a2, .LBB61_663 @@ -5599,8 +5542,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 128 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 2 ; CHECK-RV32-NEXT: bnez a2, .LBB61_664 @@ -5614,8 +5557,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 129 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 4 ; CHECK-RV32-NEXT: bnez a2, .LBB61_665 @@ -5629,8 +5572,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 130 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 8 ; CHECK-RV32-NEXT: bnez a2, .LBB61_666 @@ -5644,8 +5587,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 131 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 16 ; CHECK-RV32-NEXT: bnez a2, .LBB61_667 @@ -5659,8 +5602,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 132 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 32 ; CHECK-RV32-NEXT: bnez a2, .LBB61_668 @@ -5674,8 +5617,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 133 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 64 ; CHECK-RV32-NEXT: bnez a2, .LBB61_669 @@ -5689,8 +5632,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 134 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 128 ; CHECK-RV32-NEXT: bnez a2, .LBB61_670 @@ -5704,8 +5647,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 135 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 256 ; CHECK-RV32-NEXT: bnez a2, .LBB61_671 @@ -5719,8 +5662,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 136 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 512 ; CHECK-RV32-NEXT: bnez a2, .LBB61_672 @@ -5734,8 +5677,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 137 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1024 ; CHECK-RV32-NEXT: bnez a2, .LBB61_673 @@ -5749,8 +5692,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 138 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 20 ; CHECK-RV32-NEXT: bltz a2, .LBB61_674 @@ -5764,8 +5707,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 139 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 19 ; CHECK-RV32-NEXT: bltz a2, .LBB61_675 @@ -5779,8 +5722,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 140 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 18 ; CHECK-RV32-NEXT: bltz a2, .LBB61_676 @@ -5794,8 +5737,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 141 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 17 ; CHECK-RV32-NEXT: bltz a2, .LBB61_677 @@ -5809,8 +5752,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 142 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 16 ; CHECK-RV32-NEXT: bltz a2, .LBB61_678 @@ -5824,8 +5767,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 143 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 15 ; CHECK-RV32-NEXT: bltz a2, .LBB61_679 @@ -5839,8 +5782,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 144 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 14 ; CHECK-RV32-NEXT: bltz a2, .LBB61_680 @@ -5854,8 +5797,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 145 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 13 ; CHECK-RV32-NEXT: bltz a2, .LBB61_681 @@ -5869,8 +5812,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 146 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 12 ; CHECK-RV32-NEXT: bltz a2, .LBB61_682 @@ -5884,8 +5827,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 147 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 11 ; CHECK-RV32-NEXT: bltz a2, .LBB61_683 @@ -5899,8 +5842,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 148 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 10 ; CHECK-RV32-NEXT: bltz a2, .LBB61_684 @@ -5914,8 +5857,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 149 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 9 ; CHECK-RV32-NEXT: bltz a2, .LBB61_685 @@ -5929,8 +5872,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 150 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 8 ; CHECK-RV32-NEXT: bltz a2, .LBB61_686 @@ -5944,8 +5887,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 151 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 7 ; CHECK-RV32-NEXT: bltz a2, .LBB61_687 @@ -5959,8 +5902,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 152 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 6 ; CHECK-RV32-NEXT: bltz a2, .LBB61_688 @@ -5974,8 +5917,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 153 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 5 ; CHECK-RV32-NEXT: bltz a2, .LBB61_689 @@ -5989,8 +5932,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 154 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 4 ; CHECK-RV32-NEXT: bltz a2, .LBB61_690 @@ -6004,8 +5947,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 155 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 3 ; CHECK-RV32-NEXT: bltz a2, .LBB61_691 @@ -6019,8 +5962,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 156 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 2 ; CHECK-RV32-NEXT: bgez a2, .LBB61_1029 @@ -6035,8 +5978,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 159 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1 ; CHECK-RV32-NEXT: bnez a3, .LBB61_693 @@ -6050,8 +5993,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 160 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 2 ; CHECK-RV32-NEXT: bnez a3, .LBB61_694 @@ -6065,8 +6008,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 161 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 4 ; CHECK-RV32-NEXT: bnez a3, .LBB61_695 @@ -6080,8 +6023,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 162 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 8 ; CHECK-RV32-NEXT: bnez a3, .LBB61_696 @@ -6095,8 +6038,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 163 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 16 ; CHECK-RV32-NEXT: bnez a3, .LBB61_697 @@ -6110,8 +6053,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 164 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 32 ; CHECK-RV32-NEXT: bnez a3, .LBB61_698 @@ -6125,8 +6068,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 165 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 64 ; CHECK-RV32-NEXT: bnez a3, .LBB61_699 @@ -6140,8 +6083,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 166 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 128 ; CHECK-RV32-NEXT: bnez a3, .LBB61_700 @@ -6155,8 +6098,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 167 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 256 ; CHECK-RV32-NEXT: bnez a3, .LBB61_701 @@ -6170,8 +6113,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 168 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 512 ; CHECK-RV32-NEXT: bnez a3, .LBB61_702 @@ -6185,8 +6128,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 169 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1024 ; CHECK-RV32-NEXT: bnez a3, .LBB61_703 @@ -6200,8 +6143,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 170 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 20 ; CHECK-RV32-NEXT: bltz a3, .LBB61_704 @@ -6215,8 +6158,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 171 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 19 ; CHECK-RV32-NEXT: bltz a3, .LBB61_705 @@ -6230,8 +6173,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 172 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 18 ; CHECK-RV32-NEXT: bltz a3, .LBB61_706 @@ -6245,8 +6188,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 173 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 17 ; CHECK-RV32-NEXT: bltz a3, .LBB61_707 @@ -6260,8 +6203,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 174 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 16 ; CHECK-RV32-NEXT: bltz a3, .LBB61_708 @@ -6275,8 +6218,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 175 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 15 ; CHECK-RV32-NEXT: bltz a3, .LBB61_709 @@ -6290,8 +6233,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 176 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 14 ; CHECK-RV32-NEXT: bltz a3, .LBB61_710 @@ -6305,8 +6248,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 177 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 13 ; CHECK-RV32-NEXT: bltz a3, .LBB61_711 @@ -6320,8 +6263,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 178 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 12 ; CHECK-RV32-NEXT: bltz a3, .LBB61_712 @@ -6335,8 +6278,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 179 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 11 ; CHECK-RV32-NEXT: bltz a3, .LBB61_713 @@ -6350,8 +6293,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 180 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 10 ; CHECK-RV32-NEXT: bltz a3, .LBB61_714 @@ -6365,8 +6308,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 181 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 9 ; CHECK-RV32-NEXT: bltz a3, .LBB61_715 @@ -6380,8 +6323,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 182 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 8 ; CHECK-RV32-NEXT: bltz a3, .LBB61_716 @@ -6395,8 +6338,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 183 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 7 ; CHECK-RV32-NEXT: bltz a3, .LBB61_717 @@ -6410,8 +6353,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 184 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 6 ; CHECK-RV32-NEXT: bltz a3, .LBB61_718 @@ -6425,8 +6368,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 185 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 5 ; CHECK-RV32-NEXT: bltz a3, .LBB61_719 @@ -6440,8 +6383,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 186 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 4 ; CHECK-RV32-NEXT: bltz a3, .LBB61_720 @@ -6455,8 +6398,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 187 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 3 ; CHECK-RV32-NEXT: bltz a3, .LBB61_721 @@ -6470,8 +6413,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 188 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 2 ; CHECK-RV32-NEXT: bgez a3, .LBB61_1030 @@ -6486,8 +6429,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 191 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1 ; CHECK-RV32-NEXT: bnez a2, .LBB61_723 @@ -6501,8 +6444,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 192 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 2 ; CHECK-RV32-NEXT: bnez a2, .LBB61_724 @@ -6516,8 +6459,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 193 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 4 ; CHECK-RV32-NEXT: bnez a2, .LBB61_725 @@ -6531,8 +6474,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 194 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 8 ; CHECK-RV32-NEXT: bnez a2, .LBB61_726 @@ -6546,8 +6489,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 195 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 16 ; CHECK-RV32-NEXT: bnez a2, .LBB61_727 @@ -6561,8 +6504,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 196 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 32 ; CHECK-RV32-NEXT: bnez a2, .LBB61_728 @@ -6576,8 +6519,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 197 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 64 ; CHECK-RV32-NEXT: bnez a2, .LBB61_729 @@ -6591,8 +6534,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 198 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 128 ; CHECK-RV32-NEXT: bnez a2, .LBB61_730 @@ -6606,8 +6549,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 199 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 256 ; CHECK-RV32-NEXT: bnez a2, .LBB61_731 @@ -6621,8 +6564,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 200 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 512 ; CHECK-RV32-NEXT: bnez a2, .LBB61_732 @@ -6636,8 +6579,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 201 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1024 ; CHECK-RV32-NEXT: bnez a2, .LBB61_733 @@ -6651,8 +6594,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 202 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 20 ; CHECK-RV32-NEXT: bltz a2, .LBB61_734 @@ -6666,8 +6609,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 203 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 19 ; CHECK-RV32-NEXT: bltz a2, .LBB61_735 @@ -6681,8 +6624,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 204 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 18 ; CHECK-RV32-NEXT: bltz a2, .LBB61_736 @@ -6696,8 +6639,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 205 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 17 ; CHECK-RV32-NEXT: bltz a2, .LBB61_737 @@ -6711,8 +6654,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 206 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 16 ; CHECK-RV32-NEXT: bltz a2, .LBB61_738 @@ -6726,8 +6669,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 207 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 15 ; CHECK-RV32-NEXT: bltz a2, .LBB61_739 @@ -6741,8 +6684,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 208 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 14 ; CHECK-RV32-NEXT: bltz a2, .LBB61_740 @@ -6756,8 +6699,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 209 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 13 ; CHECK-RV32-NEXT: bltz a2, .LBB61_741 @@ -6771,8 +6714,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 210 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 12 ; CHECK-RV32-NEXT: bltz a2, .LBB61_742 @@ -6786,8 +6729,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 211 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 11 ; CHECK-RV32-NEXT: bltz a2, .LBB61_743 @@ -6801,8 +6744,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 212 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 10 ; CHECK-RV32-NEXT: bltz a2, .LBB61_744 @@ -6816,8 +6759,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 213 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 9 ; CHECK-RV32-NEXT: bltz a2, .LBB61_745 @@ -6831,8 +6774,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 214 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 8 ; CHECK-RV32-NEXT: bltz a2, .LBB61_746 @@ -6846,8 +6789,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 215 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 7 ; CHECK-RV32-NEXT: bltz a2, .LBB61_747 @@ -6861,8 +6804,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 216 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 6 ; CHECK-RV32-NEXT: bltz a2, .LBB61_748 @@ -6876,8 +6819,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 217 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 5 ; CHECK-RV32-NEXT: bltz a2, .LBB61_749 @@ -6891,8 +6834,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 218 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 4 ; CHECK-RV32-NEXT: bltz a2, .LBB61_750 @@ -6906,8 +6849,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 219 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 3 ; CHECK-RV32-NEXT: bltz a2, .LBB61_751 @@ -6921,8 +6864,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 220 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: slli a2, a3, 2 ; CHECK-RV32-NEXT: bgez a2, .LBB61_1031 @@ -6937,8 +6880,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 223 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1 ; CHECK-RV32-NEXT: bnez a3, .LBB61_753 @@ -6952,8 +6895,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 224 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 2 ; CHECK-RV32-NEXT: bnez a3, .LBB61_754 @@ -6967,8 +6910,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 225 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 4 ; CHECK-RV32-NEXT: bnez a3, .LBB61_755 @@ -6982,8 +6925,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 226 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 8 ; CHECK-RV32-NEXT: bnez a3, .LBB61_756 @@ -6997,8 +6940,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 227 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 16 ; CHECK-RV32-NEXT: bnez a3, .LBB61_757 @@ -7012,8 +6955,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 228 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 32 ; CHECK-RV32-NEXT: bnez a3, .LBB61_758 @@ -7027,8 +6970,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 229 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 64 ; CHECK-RV32-NEXT: bnez a3, .LBB61_759 @@ -7042,8 +6985,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 230 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 128 ; CHECK-RV32-NEXT: bnez a3, .LBB61_760 @@ -7057,8 +7000,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 231 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 256 ; CHECK-RV32-NEXT: bnez a3, .LBB61_761 @@ -7072,8 +7015,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 232 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 512 ; CHECK-RV32-NEXT: bnez a3, .LBB61_762 @@ -7087,8 +7030,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 233 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: andi a3, a2, 1024 ; CHECK-RV32-NEXT: bnez a3, .LBB61_763 @@ -7102,8 +7045,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 234 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 20 ; CHECK-RV32-NEXT: bltz a3, .LBB61_764 @@ -7117,8 +7060,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 235 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 19 ; CHECK-RV32-NEXT: bltz a3, .LBB61_765 @@ -7132,8 +7075,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 236 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 18 ; CHECK-RV32-NEXT: bltz a3, .LBB61_766 @@ -7147,8 +7090,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 237 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 17 ; CHECK-RV32-NEXT: bltz a3, .LBB61_767 @@ -7162,8 +7105,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 238 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 16 ; CHECK-RV32-NEXT: bltz a3, .LBB61_768 @@ -7177,8 +7120,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 239 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 15 ; CHECK-RV32-NEXT: bltz a3, .LBB61_769 @@ -7192,8 +7135,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 240 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 14 ; CHECK-RV32-NEXT: bltz a3, .LBB61_770 @@ -7207,8 +7150,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 241 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 13 ; CHECK-RV32-NEXT: bltz a3, .LBB61_771 @@ -7222,8 +7165,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 242 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 12 ; CHECK-RV32-NEXT: bltz a3, .LBB61_772 @@ -7237,8 +7180,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 243 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 11 ; CHECK-RV32-NEXT: bltz a3, .LBB61_773 @@ -7252,8 +7195,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 244 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 10 ; CHECK-RV32-NEXT: bltz a3, .LBB61_774 @@ -7267,8 +7210,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 245 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 9 ; CHECK-RV32-NEXT: bltz a3, .LBB61_775 @@ -7282,8 +7225,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 246 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 8 ; CHECK-RV32-NEXT: bltz a3, .LBB61_776 @@ -7297,8 +7240,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 247 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 7 ; CHECK-RV32-NEXT: bltz a3, .LBB61_777 @@ -7312,8 +7255,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 248 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 6 ; CHECK-RV32-NEXT: bltz a3, .LBB61_778 @@ -7327,8 +7270,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 249 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 5 ; CHECK-RV32-NEXT: bltz a3, .LBB61_779 @@ -7342,8 +7285,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 250 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 4 ; CHECK-RV32-NEXT: bltz a3, .LBB61_780 @@ -7357,8 +7300,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 251 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 3 ; CHECK-RV32-NEXT: bltz a3, .LBB61_781 @@ -7372,8 +7315,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 252 ; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v16, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: slli a3, a2, 2 ; CHECK-RV32-NEXT: bgez a3, .LBB61_1032 @@ -7388,8 +7331,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: li a4, 255 ; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV32-NEXT: vslideup.vx v8, v12, a4 -; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: addi a0, a0, 1 ; CHECK-RV32-NEXT: vmv8r.v v8, v24 ; CHECK-RV32-NEXT: andi a2, a3, 1 ; CHECK-RV32-NEXT: bnez a2, .LBB61_783 @@ -10794,13 +10737,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 61 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: .LBB61_63: # %else242 -; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: bgez a1, .LBB61_65 ; CHECK-RV64-NEXT: # %bb.64: # %cond.load245 ; CHECK-RV64-NEXT: lbu a1, 0(a0) @@ -10810,8 +10753,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 62 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 ; CHECK-RV64-NEXT: .LBB61_65: # %else246 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -11074,13 +11017,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 125 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: .LBB61_129: # %else498 -; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: bgez a2, .LBB61_131 ; CHECK-RV64-NEXT: # %bb.130: # %cond.load501 ; CHECK-RV64-NEXT: lbu a2, 0(a0) @@ -11090,8 +11033,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 126 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v24, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 ; CHECK-RV64-NEXT: .LBB61_131: # %else502 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -11354,13 +11297,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 189 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: .LBB61_195: # %else754 -; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 3 +; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: bgez a1, .LBB61_197 ; CHECK-RV64-NEXT: # %bb.196: # %cond.load757 ; CHECK-RV64-NEXT: lbu a1, 0(a0) @@ -11370,8 +11313,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 190 
; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 ; CHECK-RV64-NEXT: .LBB61_197: # %else758 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -11634,13 +11577,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 253 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: .LBB61_261: # %else1010 -; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 4 +; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: bgez a2, .LBB61_263 ; CHECK-RV64-NEXT: # %bb.262: # %cond.load1013 ; CHECK-RV64-NEXT: lbu a2, 0(a0) @@ -11650,8 +11593,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 254 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v24 ; CHECK-RV64-NEXT: .LBB61_263: # %else1014 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -11916,9 +11859,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: .LBB61_327: # %else1266 -; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 5 +; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: bgez a1, .LBB61_329 ; CHECK-RV64-NEXT: # %bb.328: # %cond.load1269 ; CHECK-RV64-NEXT: lbu a1, 0(a0) @@ -12191,9 +12134,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: .LBB61_393: # %else1522 -; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 6 +; CHECK-RV64-NEXT: slli a2, a1, 1 ; CHECK-RV64-NEXT: bgez a2, .LBB61_395 ; CHECK-RV64-NEXT: # %bb.394: # %cond.load1525 ; CHECK-RV64-NEXT: lbu a2, 0(a0) @@ -12466,9 +12409,9 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 ; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: .LBB61_459: # %else1778 -; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 7 +; CHECK-RV64-NEXT: slli a1, a2, 1 ; CHECK-RV64-NEXT: bgez a1, .LBB61_461 ; CHECK-RV64-NEXT: # %bb.460: # %cond.load1781 ; CHECK-RV64-NEXT: lbu a1, 0(a0) @@ -12745,8 +12688,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vsetvli zero, zero, e8, mf8, tu, ma ; CHECK-RV64-NEXT: vmv.s.x v8, a1 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 2 ; CHECK-RV64-NEXT: bnez a1, .LBB61_528 @@ -12757,8 +12700,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr 
%base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 1 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 4 ; CHECK-RV64-NEXT: bnez a1, .LBB61_529 @@ -12769,8 +12712,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 2 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 8 ; CHECK-RV64-NEXT: bnez a1, .LBB61_530 @@ -12781,8 +12724,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 16 ; CHECK-RV64-NEXT: bnez a1, .LBB61_531 @@ -12793,8 +12736,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 4 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 32 ; CHECK-RV64-NEXT: bnez a1, .LBB61_532 @@ -12805,8 +12748,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 5 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 64 ; CHECK-RV64-NEXT: bnez a1, .LBB61_533 @@ -12817,8 +12760,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 6 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 128 ; CHECK-RV64-NEXT: bnez a1, .LBB61_534 @@ -12829,8 +12772,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 7 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 256 ; CHECK-RV64-NEXT: bnez a1, .LBB61_535 @@ -12841,8 +12784,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 8 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 512 ; CHECK-RV64-NEXT: bnez a1, .LBB61_536 @@ -12853,8 +12796,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: 
vslideup.vi v8, v9, 9 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1024 ; CHECK-RV64-NEXT: bnez a1, .LBB61_537 @@ -12865,8 +12808,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 10 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 52 ; CHECK-RV64-NEXT: bltz a1, .LBB61_538 @@ -12877,8 +12820,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 11 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 51 ; CHECK-RV64-NEXT: bltz a1, .LBB61_539 @@ -12889,8 +12832,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 12 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 50 ; CHECK-RV64-NEXT: bltz a1, .LBB61_540 @@ -12901,8 +12844,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 13 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 49 ; CHECK-RV64-NEXT: bltz a1, .LBB61_541 @@ -12913,8 +12856,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 14 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 48 ; CHECK-RV64-NEXT: bltz a1, .LBB61_542 @@ -12925,8 +12868,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 15 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 47 ; CHECK-RV64-NEXT: bltz a1, .LBB61_543 @@ -12937,8 +12880,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 16 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 46 ; CHECK-RV64-NEXT: bltz a1, .LBB61_544 @@ -12949,8 +12892,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 17 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: 
addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 45 ; CHECK-RV64-NEXT: bltz a1, .LBB61_545 @@ -12961,8 +12904,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 18 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 44 ; CHECK-RV64-NEXT: bltz a1, .LBB61_546 @@ -12973,8 +12916,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 19 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 43 ; CHECK-RV64-NEXT: bltz a1, .LBB61_547 @@ -12985,8 +12928,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 20 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 42 ; CHECK-RV64-NEXT: bltz a1, .LBB61_548 @@ -12997,8 +12940,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 21 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 41 ; CHECK-RV64-NEXT: bltz a1, .LBB61_549 @@ -13009,8 +12952,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 22 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 40 ; CHECK-RV64-NEXT: bltz a1, .LBB61_550 @@ -13021,8 +12964,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 23 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 39 ; CHECK-RV64-NEXT: bltz a1, .LBB61_551 @@ -13033,8 +12976,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 24 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 38 ; CHECK-RV64-NEXT: bltz a1, .LBB61_552 @@ -13045,8 +12988,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 25 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 37 ; CHECK-RV64-NEXT: bltz a1, 
.LBB61_553 @@ -13057,8 +13000,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 26 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 36 ; CHECK-RV64-NEXT: bltz a1, .LBB61_554 @@ -13069,8 +13012,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 27 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 35 ; CHECK-RV64-NEXT: bltz a1, .LBB61_555 @@ -13081,8 +13024,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 28 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 34 ; CHECK-RV64-NEXT: bltz a1, .LBB61_556 @@ -13093,8 +13036,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 29 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 33 ; CHECK-RV64-NEXT: bltz a1, .LBB61_557 @@ -13105,8 +13048,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: vmv8r.v v16, v8 ; CHECK-RV64-NEXT: vmv.s.x v9, a1 ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 30 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 32 ; CHECK-RV64-NEXT: bltz a1, .LBB61_558 @@ -13119,8 +13062,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a1, 32 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vi v8, v9, 31 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 31 ; CHECK-RV64-NEXT: bltz a1, .LBB61_559 @@ -13134,8 +13077,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 32 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 30 ; CHECK-RV64-NEXT: bltz a1, .LBB61_560 @@ -13149,8 +13092,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 33 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 29 ; CHECK-RV64-NEXT: bltz a1, .LBB61_561 @@ -13164,8 +13107,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 34 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 28 ; CHECK-RV64-NEXT: bltz a1, .LBB61_562 @@ -13179,8 +13122,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 35 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 27 ; CHECK-RV64-NEXT: bltz a1, .LBB61_563 @@ -13194,8 +13137,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 36 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 26 ; CHECK-RV64-NEXT: bltz a1, .LBB61_564 @@ -13209,8 +13152,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 37 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 25 ; CHECK-RV64-NEXT: bltz a1, .LBB61_565 @@ -13224,8 +13167,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 38 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 24 ; CHECK-RV64-NEXT: bltz a1, .LBB61_566 @@ -13239,8 +13182,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 39 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 23 ; CHECK-RV64-NEXT: bltz a1, .LBB61_567 @@ -13254,8 +13197,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 40 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 22 ; CHECK-RV64-NEXT: bltz a1, .LBB61_568 @@ -13269,8 +13212,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 41 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 21 ; CHECK-RV64-NEXT: bltz a1, .LBB61_569 @@ -13284,8 +13227,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 42 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 20 ; CHECK-RV64-NEXT: bltz a1, .LBB61_570 @@ -13299,8 +13242,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 43 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 19 ; CHECK-RV64-NEXT: bltz a1, .LBB61_571 @@ -13314,8 +13257,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 44 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 18 ; CHECK-RV64-NEXT: bltz a1, .LBB61_572 @@ -13329,8 +13272,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 45 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 17 ; CHECK-RV64-NEXT: bltz a1, .LBB61_573 @@ -13344,8 +13287,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 46 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 16 ; CHECK-RV64-NEXT: bltz a1, .LBB61_574 @@ -13359,8 +13302,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 47 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 15 ; CHECK-RV64-NEXT: bltz a1, .LBB61_575 @@ -13374,8 +13317,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 48 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 14 ; CHECK-RV64-NEXT: bltz a1, .LBB61_576 @@ -13389,8 +13332,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 49 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 13 ; CHECK-RV64-NEXT: bltz a1, .LBB61_577 @@ -13404,8 +13347,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 50 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 12 ; CHECK-RV64-NEXT: bltz a1, .LBB61_578 @@ -13419,8 +13362,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 51 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 11 ; CHECK-RV64-NEXT: bltz a1, .LBB61_579 @@ -13434,8 +13377,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 52 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 10 ; CHECK-RV64-NEXT: bltz a1, .LBB61_580 @@ -13449,8 +13392,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 53 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 9 ; CHECK-RV64-NEXT: bltz a1, .LBB61_581 @@ -13464,8 +13407,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 54 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 8 ; CHECK-RV64-NEXT: bltz a1, .LBB61_582 @@ -13479,8 +13422,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 55 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 7 ; CHECK-RV64-NEXT: bltz a1, .LBB61_583 @@ -13494,8 +13437,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 56 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 6 ; CHECK-RV64-NEXT: bltz a1, .LBB61_584 @@ -13509,8 +13452,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 57 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 5 ; CHECK-RV64-NEXT: bltz a1, .LBB61_585 @@ -13524,8 +13467,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 58 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 4 ; CHECK-RV64-NEXT: bltz a1, .LBB61_586 @@ -13539,8 +13482,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 59 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 3 ; CHECK-RV64-NEXT: bltz a1, .LBB61_587 @@ -13554,8 +13497,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 60 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 2 ; CHECK-RV64-NEXT: bgez a1, .LBB61_1025 @@ -13570,8 +13513,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 63 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m1, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v9, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 1 ; CHECK-RV64-NEXT: bnez a2, .LBB61_589 @@ -13585,8 +13528,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 64 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 2 ; CHECK-RV64-NEXT: bnez a2, .LBB61_590 @@ -13600,8 +13543,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 65 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 4 ; CHECK-RV64-NEXT: bnez a2, .LBB61_591 @@ -13615,8 +13558,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 66 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 8 ; CHECK-RV64-NEXT: bnez a2, .LBB61_592 @@ -13630,8 +13573,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 67 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 16 ; CHECK-RV64-NEXT: bnez a2, .LBB61_593 @@ -13645,8 +13588,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 68 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 32 ; CHECK-RV64-NEXT: bnez a2, .LBB61_594 @@ -13660,8 +13603,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 69 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 64 ; CHECK-RV64-NEXT: bnez a2, .LBB61_595 @@ -13675,8 +13618,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 70 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 128 ; CHECK-RV64-NEXT: bnez a2, .LBB61_596 @@ -13690,8 +13633,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 71 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 256 ; CHECK-RV64-NEXT: bnez a2, .LBB61_597 @@ -13705,8 +13648,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 72 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 512 ; CHECK-RV64-NEXT: bnez a2, .LBB61_598 @@ -13720,8 +13663,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 73 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 1024 ; CHECK-RV64-NEXT: bnez a2, .LBB61_599 @@ -13735,8 +13678,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 74 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 52 ; CHECK-RV64-NEXT: bltz a2, .LBB61_600 @@ -13750,8 +13693,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 75 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 51 ; CHECK-RV64-NEXT: bltz a2, .LBB61_601 @@ -13765,8 +13708,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 76 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 50 ; CHECK-RV64-NEXT: bltz a2, .LBB61_602 @@ -13780,8 +13723,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 77 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 49 ; CHECK-RV64-NEXT: bltz a2, .LBB61_603 @@ -13795,8 +13738,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 78 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 48 ; CHECK-RV64-NEXT: bltz a2, .LBB61_604 @@ -13810,8 +13753,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 79 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 47 ; CHECK-RV64-NEXT: bltz a2, .LBB61_605 @@ -13825,8 +13768,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 80 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 46 ; CHECK-RV64-NEXT: bltz a2, .LBB61_606 @@ -13840,8 +13783,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 81 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 45 ; CHECK-RV64-NEXT: bltz a2, .LBB61_607 @@ -13855,8 +13798,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 82 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 44 ; CHECK-RV64-NEXT: bltz a2, .LBB61_608 @@ -13870,8 +13813,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 83 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 43 ; CHECK-RV64-NEXT: bltz a2, .LBB61_609 @@ -13885,8 +13828,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 84 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 42 ; CHECK-RV64-NEXT: bltz a2, .LBB61_610 @@ -13900,8 +13843,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 85 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 41 ; CHECK-RV64-NEXT: bltz a2, .LBB61_611 @@ -13915,8 +13858,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 86 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 40 ; CHECK-RV64-NEXT: bltz a2, .LBB61_612 @@ -13930,8 +13873,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 87 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 39 ; CHECK-RV64-NEXT: bltz a2, .LBB61_613 @@ -13945,8 +13888,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 88 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 38 ; CHECK-RV64-NEXT: bltz a2, .LBB61_614 @@ -13960,8 +13903,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 89 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 37 ; CHECK-RV64-NEXT: bltz a2, .LBB61_615 @@ -13975,8 +13918,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 90 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 36 ; CHECK-RV64-NEXT: bltz a2, .LBB61_616 @@ -13990,8 +13933,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 91 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 35 ; CHECK-RV64-NEXT: bltz a2, .LBB61_617 @@ -14005,8 +13948,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 92 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 34 ; CHECK-RV64-NEXT: bltz a2, .LBB61_618 @@ -14020,8 +13963,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 93 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 33 ; CHECK-RV64-NEXT: bltz a2, .LBB61_619 @@ -14035,8 +13978,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 94 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 32 ; CHECK-RV64-NEXT: bltz a2, .LBB61_620 @@ -14050,8 +13993,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 95 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 31 ; CHECK-RV64-NEXT: bltz a2, .LBB61_621 @@ -14065,8 +14008,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 96 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 30 ; CHECK-RV64-NEXT: bltz a2, .LBB61_622 @@ -14080,8 +14023,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 97 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 29 ; CHECK-RV64-NEXT: bltz a2, .LBB61_623 @@ -14095,8 +14038,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 98 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 28 ; CHECK-RV64-NEXT: bltz a2, .LBB61_624 @@ -14110,8 +14053,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 99 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 27 ; CHECK-RV64-NEXT: bltz a2, .LBB61_625 @@ -14125,8 +14068,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 100 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 26 ; CHECK-RV64-NEXT: bltz a2, .LBB61_626 @@ -14140,8 +14083,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 101 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 25 ; CHECK-RV64-NEXT: bltz a2, .LBB61_627 @@ -14155,8 +14098,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 102 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 24 ; CHECK-RV64-NEXT: bltz a2, .LBB61_628 @@ -14170,8 +14113,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 103 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 23 ; CHECK-RV64-NEXT: bltz a2, .LBB61_629 @@ -14185,8 +14128,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 104 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 22 ; CHECK-RV64-NEXT: bltz a2, .LBB61_630 @@ -14200,8 +14143,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 105 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 21 ; CHECK-RV64-NEXT: bltz a2, .LBB61_631 @@ -14215,8 +14158,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 106 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 20 ; CHECK-RV64-NEXT: bltz a2, .LBB61_632 @@ -14230,8 +14173,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 107 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 19 ; CHECK-RV64-NEXT: bltz a2, .LBB61_633 @@ -14245,8 +14188,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 108 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 18 ; CHECK-RV64-NEXT: bltz a2, .LBB61_634 @@ -14260,8 +14203,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 109 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 17 ; CHECK-RV64-NEXT: bltz a2, .LBB61_635 @@ -14275,8 +14218,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 110 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 16 ; CHECK-RV64-NEXT: bltz a2, .LBB61_636 @@ -14290,8 +14233,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 111 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 15 ; CHECK-RV64-NEXT: bltz a2, .LBB61_637 @@ -14305,8 +14248,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 112 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 14 ; CHECK-RV64-NEXT: bltz a2, .LBB61_638 @@ -14320,8 +14263,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 113 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 13 ; CHECK-RV64-NEXT: bltz a2, .LBB61_639 @@ -14335,8 +14278,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 114 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 12 ; CHECK-RV64-NEXT: bltz a2, .LBB61_640 @@ -14350,8 +14293,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 115 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 11 ; CHECK-RV64-NEXT: bltz a2, .LBB61_641 @@ -14365,8 +14308,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 116 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 10 ; CHECK-RV64-NEXT: bltz a2, .LBB61_642 @@ -14380,8 +14323,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 117 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 9 ; CHECK-RV64-NEXT: bltz a2, .LBB61_643 @@ -14395,8 +14338,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 118 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 8 ; CHECK-RV64-NEXT: bltz a2, .LBB61_644 @@ -14410,8 +14353,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 119 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 7 ; CHECK-RV64-NEXT: bltz a2, .LBB61_645 @@ -14425,8 +14368,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 120 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 6 ; CHECK-RV64-NEXT: bltz a2, .LBB61_646 @@ -14440,8 +14383,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 121 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 5 ; CHECK-RV64-NEXT: bltz a2, .LBB61_647 @@ -14455,8 +14398,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 122 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 4 ; CHECK-RV64-NEXT: bltz a2, .LBB61_648 @@ -14470,8 +14413,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 123 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 3 ; CHECK-RV64-NEXT: bltz a2, .LBB61_649 @@ -14485,8 +14428,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 124 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 2 ; CHECK-RV64-NEXT: bgez a2, .LBB61_1026 @@ -14501,8 +14444,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 127 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v10, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1 ; CHECK-RV64-NEXT: bnez a1, .LBB61_651 @@ -14516,8 +14459,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 128 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 2 ; CHECK-RV64-NEXT: bnez a1, .LBB61_652 @@ -14531,8 +14474,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 129 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 4 ; CHECK-RV64-NEXT: bnez a1, .LBB61_653 @@ -14546,8 +14489,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 130 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 8 ; CHECK-RV64-NEXT: bnez a1, .LBB61_654 @@ -14561,8 +14504,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 131 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 16 ; CHECK-RV64-NEXT: bnez a1, .LBB61_655 @@ -14576,8 +14519,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 132 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 32 ; CHECK-RV64-NEXT: bnez a1, .LBB61_656 @@ -14591,8 +14534,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 133 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 64 ; CHECK-RV64-NEXT: bnez a1, .LBB61_657 @@ -14606,8 +14549,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 134 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 128 ; CHECK-RV64-NEXT: bnez a1, .LBB61_658 @@ -14621,8 +14564,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 135 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 256 ; CHECK-RV64-NEXT: bnez a1, .LBB61_659 @@ -14636,8 +14579,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 136 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 512 ; CHECK-RV64-NEXT: bnez a1, .LBB61_660 @@ -14651,8 +14594,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 137 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1024 ; CHECK-RV64-NEXT: bnez a1, .LBB61_661 @@ -14666,8 +14609,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 138 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 52 ; CHECK-RV64-NEXT: bltz a1, .LBB61_662 @@ -14681,8 +14624,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 139 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 51 ; CHECK-RV64-NEXT: bltz a1, .LBB61_663 @@ -14696,8 +14639,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 140 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 50 ; CHECK-RV64-NEXT: bltz a1, .LBB61_664 @@ -14711,8 +14654,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 141 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 49 ; CHECK-RV64-NEXT: bltz a1, .LBB61_665 @@ -14726,8 +14669,8 @@ define <512 x 
i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 142 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 48 ; CHECK-RV64-NEXT: bltz a1, .LBB61_666 @@ -14741,8 +14684,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 143 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 47 ; CHECK-RV64-NEXT: bltz a1, .LBB61_667 @@ -14756,8 +14699,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 144 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 46 ; CHECK-RV64-NEXT: bltz a1, .LBB61_668 @@ -14771,8 +14714,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 145 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 45 ; CHECK-RV64-NEXT: bltz a1, .LBB61_669 @@ -14786,8 +14729,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 146 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 44 ; CHECK-RV64-NEXT: bltz a1, .LBB61_670 @@ -14801,8 +14744,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 147 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 43 ; CHECK-RV64-NEXT: bltz a1, .LBB61_671 @@ -14816,8 +14759,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 148 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 42 ; CHECK-RV64-NEXT: bltz a1, .LBB61_672 @@ -14831,8 +14774,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 149 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 41 ; CHECK-RV64-NEXT: bltz a1, .LBB61_673 @@ -14846,8 +14789,8 @@ define <512 x 
i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 150 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 40 ; CHECK-RV64-NEXT: bltz a1, .LBB61_674 @@ -14861,8 +14804,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 151 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 39 ; CHECK-RV64-NEXT: bltz a1, .LBB61_675 @@ -14876,8 +14819,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 152 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 38 ; CHECK-RV64-NEXT: bltz a1, .LBB61_676 @@ -14891,8 +14834,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 153 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 37 ; CHECK-RV64-NEXT: bltz a1, .LBB61_677 @@ -14906,8 +14849,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 154 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 36 ; CHECK-RV64-NEXT: bltz a1, .LBB61_678 @@ -14921,8 +14864,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 155 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 35 ; CHECK-RV64-NEXT: bltz a1, .LBB61_679 @@ -14936,8 +14879,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 156 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 34 ; CHECK-RV64-NEXT: bltz a1, .LBB61_680 @@ -14951,8 +14894,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 157 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 33 ; CHECK-RV64-NEXT: bltz a1, .LBB61_681 @@ -14966,8 +14909,8 @@ define <512 x 
i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 158 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 32 ; CHECK-RV64-NEXT: bltz a1, .LBB61_682 @@ -14981,8 +14924,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 159 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 31 ; CHECK-RV64-NEXT: bltz a1, .LBB61_683 @@ -14996,8 +14939,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 160 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 30 ; CHECK-RV64-NEXT: bltz a1, .LBB61_684 @@ -15011,8 +14954,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 161 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 29 ; CHECK-RV64-NEXT: bltz a1, .LBB61_685 @@ -15026,8 +14969,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 162 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 28 ; CHECK-RV64-NEXT: bltz a1, .LBB61_686 @@ -15041,8 +14984,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 163 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 27 ; CHECK-RV64-NEXT: bltz a1, .LBB61_687 @@ -15056,8 +14999,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 164 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 26 ; CHECK-RV64-NEXT: bltz a1, .LBB61_688 @@ -15071,8 +15014,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 165 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 25 ; CHECK-RV64-NEXT: bltz a1, .LBB61_689 @@ -15086,8 +15029,8 @@ define <512 x 
i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 166 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 24 ; CHECK-RV64-NEXT: bltz a1, .LBB61_690 @@ -15101,8 +15044,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 167 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 23 ; CHECK-RV64-NEXT: bltz a1, .LBB61_691 @@ -15116,8 +15059,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 168 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 22 ; CHECK-RV64-NEXT: bltz a1, .LBB61_692 @@ -15131,8 +15074,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 169 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 21 ; CHECK-RV64-NEXT: bltz a1, .LBB61_693 @@ -15146,8 +15089,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 170 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 20 ; CHECK-RV64-NEXT: bltz a1, .LBB61_694 @@ -15161,8 +15104,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 171 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 19 ; CHECK-RV64-NEXT: bltz a1, .LBB61_695 @@ -15176,8 +15119,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 172 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 18 ; CHECK-RV64-NEXT: bltz a1, .LBB61_696 @@ -15191,8 +15134,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 173 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 17 ; CHECK-RV64-NEXT: bltz a1, .LBB61_697 @@ -15206,8 +15149,8 @@ define <512 x 
i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 174 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 16 ; CHECK-RV64-NEXT: bltz a1, .LBB61_698 @@ -15221,8 +15164,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 175 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 15 ; CHECK-RV64-NEXT: bltz a1, .LBB61_699 @@ -15236,8 +15179,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 176 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 14 ; CHECK-RV64-NEXT: bltz a1, .LBB61_700 @@ -15251,8 +15194,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 177 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 13 ; CHECK-RV64-NEXT: bltz a1, .LBB61_701 @@ -15266,8 +15209,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 178 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 12 ; CHECK-RV64-NEXT: bltz a1, .LBB61_702 @@ -15281,8 +15224,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 179 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 11 ; CHECK-RV64-NEXT: bltz a1, .LBB61_703 @@ -15296,8 +15239,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 180 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 10 ; CHECK-RV64-NEXT: bltz a1, .LBB61_704 @@ -15311,8 +15254,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 181 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 9 ; CHECK-RV64-NEXT: bltz a1, .LBB61_705 @@ -15326,8 +15269,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 182 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 8 ; CHECK-RV64-NEXT: bltz a1, .LBB61_706 @@ -15341,8 +15284,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 183 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 7 ; CHECK-RV64-NEXT: bltz a1, .LBB61_707 @@ -15356,8 +15299,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 184 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 6 ; CHECK-RV64-NEXT: bltz a1, .LBB61_708 @@ -15371,8 +15314,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 185 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 5 ; CHECK-RV64-NEXT: bltz a1, .LBB61_709 @@ -15386,8 +15329,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 186 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 4 ; CHECK-RV64-NEXT: bltz a1, .LBB61_710 @@ -15401,8 +15344,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 187 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 3 ; CHECK-RV64-NEXT: bltz a1, .LBB61_711 @@ -15416,8 +15359,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 188 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a1, a2, 2 ; CHECK-RV64-NEXT: bgez a1, .LBB61_1027 @@ -15432,8 +15375,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 191 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 1 ; CHECK-RV64-NEXT: bnez a2, .LBB61_713 @@ -15447,8 +15390,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 192 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 2 ; CHECK-RV64-NEXT: bnez a2, .LBB61_714 @@ -15462,8 +15405,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 193 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 4 ; CHECK-RV64-NEXT: bnez a2, .LBB61_715 @@ -15477,8 +15420,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 194 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 8 ; CHECK-RV64-NEXT: bnez a2, .LBB61_716 @@ -15492,8 +15435,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 195 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 16 ; CHECK-RV64-NEXT: bnez a2, .LBB61_717 @@ -15507,8 +15450,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 196 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 32 ; CHECK-RV64-NEXT: bnez a2, .LBB61_718 @@ -15522,8 +15465,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 197 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 64 ; CHECK-RV64-NEXT: bnez a2, .LBB61_719 @@ -15537,8 +15480,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 198 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 128 ; CHECK-RV64-NEXT: bnez a2, .LBB61_720 @@ -15552,8 +15495,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 199 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 256 ; CHECK-RV64-NEXT: bnez a2, .LBB61_721 @@ -15567,8 +15510,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 200 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 512 ; CHECK-RV64-NEXT: bnez a2, .LBB61_722 @@ -15582,8 +15525,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 201 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a2, a1, 1024 ; CHECK-RV64-NEXT: bnez a2, .LBB61_723 @@ -15597,8 +15540,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 202 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 52 ; CHECK-RV64-NEXT: bltz a2, .LBB61_724 @@ -15612,8 +15555,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 203 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 51 ; CHECK-RV64-NEXT: bltz a2, .LBB61_725 @@ -15627,8 +15570,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 204 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 50 ; CHECK-RV64-NEXT: bltz a2, .LBB61_726 @@ -15642,8 +15585,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 205 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 49 ; CHECK-RV64-NEXT: bltz a2, .LBB61_727 @@ -15657,8 +15600,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 206 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 48 ; CHECK-RV64-NEXT: bltz a2, .LBB61_728 @@ -15672,8 +15615,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 207 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 47 ; CHECK-RV64-NEXT: bltz a2, .LBB61_729 @@ -15687,8 +15630,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 208 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 46 ; CHECK-RV64-NEXT: bltz a2, .LBB61_730 @@ -15702,8 +15645,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 209 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 45 ; CHECK-RV64-NEXT: bltz a2, .LBB61_731 @@ -15717,8 +15660,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 210 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 44 ; CHECK-RV64-NEXT: bltz a2, .LBB61_732 @@ -15732,8 +15675,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 211 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 43 ; CHECK-RV64-NEXT: bltz a2, .LBB61_733 @@ -15747,8 +15690,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 212 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 42 ; CHECK-RV64-NEXT: bltz a2, .LBB61_734 @@ -15762,8 +15705,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 213 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 41 ; CHECK-RV64-NEXT: bltz a2, .LBB61_735 @@ -15777,8 +15720,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 214 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 40 ; CHECK-RV64-NEXT: bltz a2, .LBB61_736 @@ -15792,8 +15735,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 215 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 39 ; CHECK-RV64-NEXT: bltz a2, .LBB61_737 @@ -15807,8 +15750,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 216 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 38 ; CHECK-RV64-NEXT: bltz a2, .LBB61_738 @@ -15822,8 +15765,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 217 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 37 ; CHECK-RV64-NEXT: bltz a2, .LBB61_739 @@ -15837,8 +15780,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 218 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 36 ; CHECK-RV64-NEXT: bltz a2, .LBB61_740 @@ -15852,8 +15795,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 219 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 35 ; CHECK-RV64-NEXT: bltz a2, .LBB61_741 @@ -15867,8 +15810,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 220 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 34 ; CHECK-RV64-NEXT: bltz a2, .LBB61_742 @@ -15882,8 +15825,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 221 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 33 ; CHECK-RV64-NEXT: bltz a2, .LBB61_743 @@ -15897,8 +15840,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 222 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 32 ; CHECK-RV64-NEXT: bltz a2, .LBB61_744 @@ -15912,8 +15855,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 223 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 31 ; CHECK-RV64-NEXT: bltz a2, .LBB61_745 @@ -15927,8 +15870,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 224 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 30 ; CHECK-RV64-NEXT: bltz a2, .LBB61_746 @@ -15942,8 +15885,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 225 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 29 ; CHECK-RV64-NEXT: bltz a2, .LBB61_747 @@ -15957,8 +15900,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 226 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 28 ; CHECK-RV64-NEXT: bltz a2, .LBB61_748 @@ -15972,8 +15915,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 227 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 27 ; CHECK-RV64-NEXT: bltz a2, .LBB61_749 @@ -15987,8 +15930,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 228 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 26 ; CHECK-RV64-NEXT: bltz a2, .LBB61_750 @@ -16002,8 +15945,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 229 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 25 ; CHECK-RV64-NEXT: bltz a2, .LBB61_751 @@ -16017,8 +15960,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 230 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 24 ; CHECK-RV64-NEXT: bltz a2, .LBB61_752 @@ -16032,8 +15975,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 231 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 23 ; CHECK-RV64-NEXT: bltz a2, .LBB61_753 @@ -16047,8 +15990,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 232 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 22 ; CHECK-RV64-NEXT: bltz a2, .LBB61_754 @@ -16062,8 +16005,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 233 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 21 ; CHECK-RV64-NEXT: bltz a2, .LBB61_755 @@ -16077,8 +16020,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 234 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 20 ; CHECK-RV64-NEXT: bltz a2, .LBB61_756 @@ -16092,8 +16035,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 235 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 19 ; CHECK-RV64-NEXT: bltz a2, .LBB61_757 @@ -16107,8 +16050,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 236 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 18 ; CHECK-RV64-NEXT: bltz a2, .LBB61_758 @@ -16122,8 +16065,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 237 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 17 ; CHECK-RV64-NEXT: bltz a2, .LBB61_759 @@ -16137,8 +16080,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 238 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 16 ; CHECK-RV64-NEXT: bltz a2, .LBB61_760 @@ -16152,8 +16095,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 239 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 15 ; CHECK-RV64-NEXT: bltz a2, .LBB61_761 @@ -16167,8 +16110,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 240 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 14 ; CHECK-RV64-NEXT: bltz a2, .LBB61_762 @@ -16182,8 +16125,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 241 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 13 ; CHECK-RV64-NEXT: bltz a2, .LBB61_763 @@ -16197,8 +16140,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 242 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 12 ; CHECK-RV64-NEXT: bltz a2, .LBB61_764 @@ -16212,8 +16155,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 243 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 11 ; CHECK-RV64-NEXT: bltz a2, .LBB61_765 @@ -16227,8 +16170,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 244 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 10 ; CHECK-RV64-NEXT: bltz a2, .LBB61_766 @@ -16242,8 +16185,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 245 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 9 ; CHECK-RV64-NEXT: bltz a2, .LBB61_767 @@ -16257,8 +16200,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 246 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 8 ; CHECK-RV64-NEXT: bltz a2, .LBB61_768 @@ -16272,8 +16215,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 247 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 7 ; CHECK-RV64-NEXT: bltz a2, .LBB61_769 @@ -16287,8 +16230,8 @@ define <512 x i8> 
@test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 248 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 6 ; CHECK-RV64-NEXT: bltz a2, .LBB61_770 @@ -16302,8 +16245,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 249 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 5 ; CHECK-RV64-NEXT: bltz a2, .LBB61_771 @@ -16317,8 +16260,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 250 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 4 ; CHECK-RV64-NEXT: bltz a2, .LBB61_772 @@ -16332,8 +16275,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 251 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 3 ; CHECK-RV64-NEXT: bltz a2, .LBB61_773 @@ -16347,8 +16290,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 252 ; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: slli a2, a1, 2 ; CHECK-RV64-NEXT: bgez a2, .LBB61_1028 @@ -16363,8 +16306,8 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV64-NEXT: li a3, 255 ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma ; CHECK-RV64-NEXT: vslideup.vx v8, v12, a3 -; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv4r.v v16, v8 +; CHECK-RV64-NEXT: addi a0, a0, 1 ; CHECK-RV64-NEXT: vmv8r.v v8, v16 ; CHECK-RV64-NEXT: andi a1, a2, 1 ; CHECK-RV64-NEXT: bnez a1, .LBB61_775 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll index d60ce408278da..2961b880bdceb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll @@ -1330,14 +1330,14 @@ define double @extractelt_nxv16f64_neg1( %v) { ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: addi a0, sp, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a1, -1 ; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: slli a3, a2, 3 +; RV64-NEXT: slli a1, a2, 3 +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: vs8r.v v16, (a1) +; RV64-NEXT: li a1, -1 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: slli a2, a2, 1 -; RV64-NEXT: add a3, a0, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: vs8r.v v16, (a3) ; RV64-NEXT: bltu a2, a1, .LBB70_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a2, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll 
index 796f8dde58f47..4664a48a2d668 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll @@ -7,9 +7,9 @@ define i1 @extractelt_nxv1i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -24,9 +24,9 @@ define i1 @extractelt_nxv2i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -41,9 +41,9 @@ define i1 @extractelt_nxv4i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -58,9 +58,9 @@ define i1 @extractelt_nxv8i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vl1r.v v8, (a0) ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -140,14 +140,14 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: addi a3, sp, 64 -; RV32-NEXT: vl8r.v v8, (a0) ; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: vl8r.v v24, (a0) -; RV32-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV32-NEXT: vmseq.vi v0, v8, 0 +; RV32-NEXT: vl8r.v v8, (a0) +; RV32-NEXT: vsetvli a4, zero, e8, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 ; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vmseq.vi v0, v8, 0 +; RV32-NEXT: vl8r.v v24, (a0) ; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: vmseq.vi v8, v24, 0 ; RV32-NEXT: vmerge.vim v24, v16, 1, v0 @@ -180,14 +180,14 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: addi a3, sp, 64 -; RV64-NEXT: vl8r.v v8, (a0) ; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vl8r.v v24, (a0) -; RV64-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; RV64-NEXT: vmseq.vi v0, v8, 0 +; RV64-NEXT: vl8r.v v8, (a0) +; RV64-NEXT: vsetvli a4, zero, e8, m8, ta, ma ; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: vmseq.vi v0, v8, 0 +; RV64-NEXT: vl8r.v v24, (a0) ; RV64-NEXT: add a2, a3, a2 ; RV64-NEXT: vmseq.vi v8, v24, 0 ; RV64-NEXT: vmerge.vim v24, v16, 1, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll index a9e129ef11a2c..1546276381021 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll @@ -863,14 +863,14 @@ define i64 
@extractelt_nxv16i64_neg1( %v) { ; CHECK-NEXT: andi sp, sp, -64 ; CHECK-NEXT: addi a0, sp, 64 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a1, -1 ; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: slli a3, a2, 3 +; CHECK-NEXT: slli a1, a2, 3 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: li a1, -1 ; CHECK-NEXT: srli a1, a1, 32 ; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: vs8r.v v16, (a3) ; CHECK-NEXT: bltu a2, a1, .LBB74_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll index 1626b362fed15..1263094f3ace0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll @@ -10,11 +10,11 @@ define @ceil_nxv1f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -33,11 +33,11 @@ define @ceil_nxv2f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -56,11 +56,11 @@ define @ceil_nxv4f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -79,11 +79,11 @@ define @ceil_nxv8f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -102,11 +102,11 @@ define @ceil_nxv16f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -125,11 +125,11 @@ define @ceil_nxv32f16( %x) 
strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -150,9 +150,9 @@ define @ceil_nxv1f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -173,9 +173,9 @@ define @ceil_nxv2f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -196,9 +196,9 @@ define @ceil_nxv4f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -219,9 +219,9 @@ define @ceil_nxv8f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -242,9 +242,9 @@ define @ceil_nxv16f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -263,11 +263,11 @@ define @ceil_nxv1f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -286,11 +286,11 @@ define @ceil_nxv2f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -309,11 +309,11 @@ define @ceil_nxv4f64( %x) strictfp { ; 
CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -332,11 +332,11 @@ define @ceil_nxv8f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll index 4aca2d694dfbb..e8a787f7b615e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll @@ -18,11 +18,11 @@ define @ceil_nxv1bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -41,11 +41,11 @@ define @ceil_nxv2bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -64,11 +64,11 @@ define @ceil_nxv4bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -87,11 +87,11 @@ define @ceil_nxv8bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -110,11 +110,11 @@ define @ceil_nxv16bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; 
CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -133,11 +133,11 @@ define @ceil_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -167,12 +167,12 @@ define @ceil_nxv32bf16( %x) { define @ceil_nxv1f16( %x) { ; ZVFH-LABEL: ceil_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -185,11 +185,11 @@ define @ceil_nxv1f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -206,12 +206,12 @@ declare @llvm.ceil.nxv1f16() define @ceil_nxv2f16( %x) { ; ZVFH-LABEL: ceil_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -224,11 +224,11 @@ define @ceil_nxv2f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -245,12 +245,12 @@ declare @llvm.ceil.nxv2f16() define @ceil_nxv4f16( %x) { ; ZVFH-LABEL: ceil_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -263,11 +263,11 @@ define @ceil_nxv4f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; 
ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -284,12 +284,12 @@ declare @llvm.ceil.nxv4f16() define @ceil_nxv8f16( %x) { ; ZVFH-LABEL: ceil_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -302,11 +302,11 @@ define @ceil_nxv8f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -323,12 +323,12 @@ declare @llvm.ceil.nxv8f16() define @ceil_nxv16f16( %x) { ; ZVFH-LABEL: ceil_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -341,11 +341,11 @@ define @ceil_nxv16f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -362,12 +362,12 @@ declare @llvm.ceil.nxv16f16() define @ceil_nxv32f16( %x) { ; ZVFH-LABEL: ceil_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -380,11 +380,11 @@ define @ceil_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: 
vfcvt.f.x.v v24, v24, v0.t @@ -419,8 +419,8 @@ define @ceil_nxv1f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -439,8 +439,8 @@ define @ceil_nxv2f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -459,8 +459,8 @@ define @ceil_nxv4f32( %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -479,8 +479,8 @@ define @ceil_nxv8f32( %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -499,8 +499,8 @@ define @ceil_nxv16f32( %x) { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -515,12 +515,12 @@ declare @llvm.ceil.nxv16f32() define @ceil_nxv1f64( %x) { ; CHECK-LABEL: ceil_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -535,12 +535,12 @@ declare @llvm.ceil.nxv1f64() define @ceil_nxv2f64( %x) { ; CHECK-LABEL: ceil_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -555,12 +555,12 @@ declare @llvm.ceil.nxv2f64() define @ceil_nxv4f64( %x) { ; CHECK-LABEL: ceil_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -575,12 +575,12 @@ declare @llvm.ceil.nxv4f64() define @ceil_nxv8f64( %x) { ; CHECK-LABEL: ceil_nxv8f64: ; CHECK: # %bb.0: -; 
CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll index d93f15ec44053..c3d7a9b3e877c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll @@ -10,11 +10,11 @@ define @floor_nxv1f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -33,11 +33,11 @@ define @floor_nxv2f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -56,11 +56,11 @@ define @floor_nxv4f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -79,11 +79,11 @@ define @floor_nxv8f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -102,11 +102,11 @@ define @floor_nxv16f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -125,11 +125,11 @@ define @floor_nxv32f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: 
lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -150,9 +150,9 @@ define @floor_nxv1f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -173,9 +173,9 @@ define @floor_nxv2f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -196,9 +196,9 @@ define @floor_nxv4f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -219,9 +219,9 @@ define @floor_nxv8f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -242,9 +242,9 @@ define @floor_nxv16f32( %x) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -263,11 +263,11 @@ define @floor_nxv1f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -286,11 +286,11 @@ define @floor_nxv2f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -309,11 +309,11 @@ define @floor_nxv4f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, 
%hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -332,11 +332,11 @@ define @floor_nxv8f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll index 010d7786c8891..88cd31f77bbbc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll @@ -18,11 +18,11 @@ define @floor_nxv1bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -42,11 +42,11 @@ define @floor_nxv2bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -66,11 +66,11 @@ define @floor_nxv4bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -90,11 +90,11 @@ define @floor_nxv8bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -114,11 +114,11 @@ define @floor_nxv16bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; 
CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -138,11 +138,11 @@ define @floor_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -173,12 +173,12 @@ declare @llvm.floor.nxv32bf16() define @floor_nxv1f16( %x) { ; ZVFH-LABEL: floor_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -191,11 +191,11 @@ define @floor_nxv1f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -212,12 +212,12 @@ declare @llvm.floor.nxv1f16() define @floor_nxv2f16( %x) { ; ZVFH-LABEL: floor_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -230,11 +230,11 @@ define @floor_nxv2f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -251,12 +251,12 @@ declare @llvm.floor.nxv2f16() define @floor_nxv4f16( %x) { ; ZVFH-LABEL: floor_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -269,11 +269,11 @@ define @floor_nxv4f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, 
zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -290,12 +290,12 @@ declare @llvm.floor.nxv4f16() define @floor_nxv8f16( %x) { ; ZVFH-LABEL: floor_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -308,11 +308,11 @@ define @floor_nxv8f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -329,12 +329,12 @@ declare @llvm.floor.nxv8f16() define @floor_nxv16f16( %x) { ; ZVFH-LABEL: floor_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -347,11 +347,11 @@ define @floor_nxv16f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -368,12 +368,12 @@ declare @llvm.floor.nxv16f16() define @floor_nxv32f16( %x) { ; ZVFH-LABEL: floor_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -386,11 +386,11 @@ define @floor_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -425,8 +425,8 @@ define @floor_nxv1f32( %x) { 
; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -445,8 +445,8 @@ define @floor_nxv2f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -465,8 +465,8 @@ define @floor_nxv4f32( %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -485,8 +485,8 @@ define @floor_nxv8f32( %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -505,8 +505,8 @@ define @floor_nxv16f32( %x) { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -521,12 +521,12 @@ declare @llvm.floor.nxv16f32() define @floor_nxv1f64( %x) { ; CHECK-LABEL: floor_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -541,12 +541,12 @@ declare @llvm.floor.nxv1f64() define @floor_nxv2f64( %x) { ; CHECK-LABEL: floor_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -561,12 +561,12 @@ declare @llvm.floor.nxv2f64() define @floor_nxv4f64( %x) { ; CHECK-LABEL: floor_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -581,12 +581,12 @@ declare @llvm.floor.nxv4f64() define @floor_nxv8f64( %x) { ; CHECK-LABEL: floor_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, 
%lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll index 1752dfd50d0c5..2b973c9b80828 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll @@ -22,27 +22,27 @@ define <512 x i8> @single_source(<512 x i8> %a) { ; CHECK-NEXT: addi a1, sp, 512 ; CHECK-NEXT: vmv.x.s a2, v16 ; CHECK-NEXT: vslidedown.vi v24, v16, 5 -; CHECK-NEXT: li a3, 432 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: li a0, 432 ; CHECK-NEXT: vmv.v.x v8, a2 -; CHECK-NEXT: lbu a0, 770(sp) +; CHECK-NEXT: lbu a1, 770(sp) +; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: li a1, 431 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: lbu a0, 1012(sp) -; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a1 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 4 -; CHECK-NEXT: li a1, 466 -; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: li a0, 465 +; CHECK-NEXT: vslidedown.vi v16, v16, 4 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v8, v24, a1 +; CHECK-NEXT: li a0, 466 +; CHECK-NEXT: lbu a1, 1012(sp) +; CHECK-NEXT: vmv.s.x v24, a1 +; CHECK-NEXT: li a1, 465 ; CHECK-NEXT: li a2, 501 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v8, v16, a1 ; CHECK-NEXT: li a0, 500 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v8, v24, a0 ; CHECK-NEXT: addi sp, s0, -1536 ; CHECK-NEXT: .cfi_def_cfa sp, 1536 ; CHECK-NEXT: ld ra, 1528(sp) # 8-byte Folded Reload @@ -103,12 +103,7 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) { ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: addi s0, sp, 1536 ; CHECK-NEXT: .cfi_def_cfa s0, 0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: andi sp, sp, -512 -; CHECK-NEXT: addi a0, sp, 1520 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: li a0, 512 @@ -127,32 +122,30 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) { ; CHECK-NEXT: li a3, 465 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vse8.v v24, (a1) -; CHECK-NEXT: lbu a1, 985(sp) +; CHECK-NEXT: li a1, 478 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v0, a3 -; CHECK-NEXT: li a2, 478 +; CHECK-NEXT: lbu a2, 985(sp) ; CHECK-NEXT: lbu a3, 1012(sp) -; CHECK-NEXT: vmv.s.x v24, a1 -; CHECK-NEXT: li a1, 477 -; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a1 +; CHECK-NEXT: vmv.s.x v24, a2 +; CHECK-NEXT: li a2, 477 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v8, v24, a2 ; CHECK-NEXT: li a1, 501 +; CHECK-NEXT: vmv.s.x v24, a3 +; CHECK-NEXT: li a2, 500 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; 
CHECK-NEXT: vslideup.vx v8, v24, a2 +; CHECK-NEXT: lui a1, %hi(.LCPI2_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI2_0) ; CHECK-NEXT: lui a2, %hi(.LCPI2_1) ; CHECK-NEXT: addi a2, a2, %lo(.LCPI2_1) +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v0, (a2) -; CHECK-NEXT: li a2, 500 -; CHECK-NEXT: vmv.s.x v24, a3 -; CHECK-NEXT: lui a3, %hi(.LCPI2_0) -; CHECK-NEXT: addi a3, a3, %lo(.LCPI2_0) -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a3) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a2 -; CHECK-NEXT: addi a1, sp, 1520 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu -; CHECK-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t ; CHECK-NEXT: addi sp, s0, -1536 ; CHECK-NEXT: .cfi_def_cfa sp, 1536 ; CHECK-NEXT: ld ra, 1528(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll index 84da351de76ba..5f0088a47af24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll @@ -417,9 +417,9 @@ declare <32 x i64> @llvm.vp.abs.v32i64(<32 x i64>, i1 immarg, <32 x i1>, i32) define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll index 425422417ec78..753a90c22a366 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast-large-vector.ll @@ -9,10 +9,10 @@ define <512 x i8> @bitcast_1024B(<256 x i16> %a, <512 x i8> %b) { ; VLEN256-NEXT: addi a1, a0, 256 ; VLEN256-NEXT: li a2, 256 ; VLEN256-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; VLEN256-NEXT: vle8.v v24, (a0) -; VLEN256-NEXT: vle8.v v0, (a1) -; VLEN256-NEXT: vadd.vv v8, v24, v8 -; VLEN256-NEXT: vadd.vv v16, v0, v16 +; VLEN256-NEXT: vle8.v v24, (a1) +; VLEN256-NEXT: vle8.v v0, (a0) +; VLEN256-NEXT: vadd.vv v8, v0, v8 +; VLEN256-NEXT: vadd.vv v16, v24, v16 ; VLEN256-NEXT: ret ; ; VLEN512-LABEL: bitcast_1024B: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll index 5ea4924468595..1ba173455a8f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll @@ -978,60 +978,60 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: vsll.vx v11, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v11, v8, a2 -; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v13, v8, a4 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vx v13, v13, a1 +; RV32-NEXT: vor.vv v12, v13, v12 ; RV32-NEXT: 
vand.vx v13, v8, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vor.vv v11, v12, v11 +; RV32-NEXT: vsll.vx v13, v13, a4 +; RV32-NEXT: vor.vv v11, v11, v13 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v12, (a6), zero +; RV32-NEXT: vlse64.v v13, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v13, v13, a4 -; RV32-NEXT: vor.vv v10, v10, v13 -; RV32-NEXT: vsrl.vi v13, v8, 8 -; RV32-NEXT: vand.vx v9, v9, a5 -; RV32-NEXT: vand.vv v13, v13, v12 -; RV32-NEXT: vor.vv v9, v13, v9 +; RV32-NEXT: vand.vv v10, v10, v13 +; RV32-NEXT: vor.vv v9, v10, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vv v10, v8, v13 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: addi a3, a3, 1365 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vor.vv v9, v9, v12 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a2 +; RV32-NEXT: vmv.v.x v12, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v11, v8 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vmv.v.x v11, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v9, v9, v12 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vand.vv v9, v9, v11 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vand.vv v9, v9, v12 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vand.vv v9, v9, v11 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 @@ -1250,25 +1250,25 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: lui a5, 4080 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v14, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vsrl.vi v14, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsll.vx v12, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v10, v8, a2 -; RV32-NEXT: vsrl.vx v16, v8, a4 -; RV32-NEXT: vand.vx v18, v8, a1 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vor.vv v10, v16, v10 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v18, v8, a4 +; RV32-NEXT: vand.vx v20, v10, a5 +; RV32-NEXT: vand.vx v10, v18, a1 +; RV32-NEXT: vor.vv v10, v10, v16 +; RV32-NEXT: vand.vx v16, v8, a1 +; RV32-NEXT: vsll.vx v16, v16, a4 +; RV32-NEXT: vor.vv v12, v12, v16 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v16, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v18, v18, a4 -; RV32-NEXT: vor.vv v12, v12, v18 -; RV32-NEXT: vsrl.vi v18, v8, 8 -; RV32-NEXT: vand.vx v14, v14, a5 -; RV32-NEXT: 
vand.vv v18, v18, v16 -; RV32-NEXT: vor.vv v14, v18, v14 +; RV32-NEXT: vand.vv v14, v14, v16 +; RV32-NEXT: vor.vv v14, v14, v20 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 @@ -1523,25 +1523,25 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: lui a5, 4080 ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v20, v8, 24 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v20, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsll.vx v16, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v12, v8, a2 -; RV32-NEXT: vsrl.vx v24, v8, a4 -; RV32-NEXT: vand.vx v28, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a1 -; RV32-NEXT: vor.vv v12, v24, v12 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vsrl.vx v28, v8, a4 +; RV32-NEXT: vand.vx v4, v12, a5 +; RV32-NEXT: vand.vx v12, v28, a1 +; RV32-NEXT: vor.vv v12, v12, v24 +; RV32-NEXT: vand.vx v24, v8, a1 +; RV32-NEXT: vsll.vx v24, v24, a4 +; RV32-NEXT: vor.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v24, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v28, v28, a4 -; RV32-NEXT: vor.vv v16, v16, v28 -; RV32-NEXT: vsrl.vi v28, v8, 8 -; RV32-NEXT: vand.vx v20, v20, a5 -; RV32-NEXT: vand.vv v28, v28, v24 -; RV32-NEXT: vor.vv v20, v28, v20 +; RV32-NEXT: vand.vv v20, v20, v24 +; RV32-NEXT: vor.vv v20, v20, v4 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: lui a3, 349525 @@ -1676,35 +1676,36 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: addi a3, a4, 819 ; RV32-NEXT: sw a3, 32(sp) ; RV32-NEXT: sw a3, 36(sp) -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: addi a4, a5, 1365 -; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: sw a4, 24(sp) ; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: vand.vx v8, v8, a5, v0.t ; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 4 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v16, v24, a3, v0.t +; RV32-NEXT: vsll.vi v8, v16, 24, v0.t +; RV32-NEXT: addi a6, sp, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v16, v24, a3, v0.t -; RV32-NEXT: vsll.vi v16, v16, 24, v0.t -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: 
csrr a4, vlenb @@ -1739,14 +1740,14 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -1761,7 +1762,7 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v24, v24, 2, v0.t ; RV32-NEXT: vor.vv v16, v16, v24, v0.t @@ -1869,75 +1870,76 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: li a5, 56 ; RV32-NEXT: lui a6, 16 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a5 -; RV32-NEXT: vsrl.vx v24, v8, a5 +; RV32-NEXT: vsll.vx v8, v8, a5 +; RV32-NEXT: vsrl.vx v24, v16, a5 ; RV32-NEXT: li a5, 40 +; RV32-NEXT: addi a6, a6, -256 +; RV32-NEXT: vsrl.vx v0, v16, a5 +; RV32-NEXT: vand.vx v0, v0, a6 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a7, sp, 48 +; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v16, a6 +; RV32-NEXT: lui a6, 4080 +; RV32-NEXT: vsll.vx v0, v0, a5 +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: slli a7, a7, 3 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 48 +; RV32-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 24 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw zero, 20(sp) ; RV32-NEXT: addi a1, a2, -241 +; RV32-NEXT: addi a2, a3, 819 +; RV32-NEXT: addi a3, a4, 1365 +; RV32-NEXT: vand.vx v0, v0, a6 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 4080 -; RV32-NEXT: addi a2, a3, 819 ; RV32-NEXT: sw a2, 32(sp) ; RV32-NEXT: sw a2, 36(sp) -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: addi a3, a4, 1365 -; RV32-NEXT: addi a4, a6, -256 -; RV32-NEXT: vsrl.vx v0, v8, a5 ; RV32-NEXT: sw a3, 24(sp) ; RV32-NEXT: sw a3, 28(sp) -; RV32-NEXT: vand.vx v0, v0, a4 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vx v0, v0, a5 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size 
Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v8, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a1 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v0, v8, v24 +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vx v16, v16, a6 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v0, v16, v8 ; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v0 +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -1947,7 +1949,7 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 @@ -2072,35 +2074,36 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: addi a3, a4, 819 ; RV32-NEXT: sw a3, 32(sp) ; RV32-NEXT: sw a3, 36(sp) -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: addi a4, a5, 1365 -; RV32-NEXT: vsll.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: sw a4, 24(sp) ; RV32-NEXT: sw a4, 28(sp) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: addi a5, a6, -256 ; RV32-NEXT: vand.vx v8, v8, a5, v0.t ; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 4 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v16, v24, a3, v0.t +; RV32-NEXT: vsll.vi v8, v16, 24, v0.t +; RV32-NEXT: addi a6, sp, 48 +; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a4, vlenb -; 
RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v16, v24, a3, v0.t -; RV32-NEXT: vsll.vi v16, v16, 24, v0.t -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb @@ -2135,14 +2138,14 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -2157,7 +2160,7 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v24, v24, 2, v0.t ; RV32-NEXT: vor.vv v16, v16, v24, v0.t @@ -2265,75 +2268,76 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: li a5, 56 ; RV32-NEXT: lui a6, 16 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a5 -; RV32-NEXT: vsrl.vx v24, v8, a5 +; RV32-NEXT: vsll.vx v8, v8, a5 +; RV32-NEXT: vsrl.vx v24, v16, a5 ; RV32-NEXT: li a5, 40 +; RV32-NEXT: addi a6, a6, -256 +; RV32-NEXT: vsrl.vx v0, v16, a5 +; RV32-NEXT: vand.vx v0, v0, a6 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a7, sp, 48 +; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v16, a6 +; RV32-NEXT: lui a6, 4080 +; RV32-NEXT: vsll.vx v0, v0, a5 +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: slli a7, a7, 3 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 48 +; RV32-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 24 ; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: sw zero, 20(sp) ; 
RV32-NEXT: addi a1, a2, -241 +; RV32-NEXT: addi a2, a3, 819 +; RV32-NEXT: addi a3, a4, 1365 +; RV32-NEXT: vand.vx v0, v0, a6 ; RV32-NEXT: sw a1, 40(sp) ; RV32-NEXT: sw a1, 44(sp) -; RV32-NEXT: lui a1, 4080 -; RV32-NEXT: addi a2, a3, 819 ; RV32-NEXT: sw a2, 32(sp) ; RV32-NEXT: sw a2, 36(sp) -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: addi a3, a4, 1365 -; RV32-NEXT: addi a4, a6, -256 -; RV32-NEXT: vsrl.vx v0, v8, a5 ; RV32-NEXT: sw a3, 24(sp) ; RV32-NEXT: sw a3, 28(sp) -; RV32-NEXT: vand.vx v0, v0, a4 -; RV32-NEXT: vor.vv v24, v0, v24 -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a4 -; RV32-NEXT: vsll.vx v0, v0, a5 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v8, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vand.vx v8, v8, a1 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v0, v8, v24 +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vx v16, v16, a6 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v0, v16, v8 ; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a1, sp, 40 ; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v0 +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 48 +; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero -; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -2343,7 +2347,7 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 @@ -2455,9 +2459,9 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, 
(a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 8 +; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll index d765e4c0b8f6a..37caf61aac19c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -370,31 +370,31 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v10, v8, a2 +; RV32-NEXT: vsll.vx v11, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v11, v8, a2 -; RV32-NEXT: vsrl.vx v12, v8, a4 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v13, v8, a4 +; RV32-NEXT: vand.vx v9, v9, a5 +; RV32-NEXT: vand.vx v13, v13, a1 +; RV32-NEXT: vor.vv v12, v13, v12 ; RV32-NEXT: vand.vx v13, v8, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vor.vv v11, v12, v11 +; RV32-NEXT: vsll.vx v13, v13, a4 +; RV32-NEXT: vor.vv v11, v11, v13 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v12, (a6), zero +; RV32-NEXT: vlse64.v v13, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsll.vx v13, v13, a4 -; RV32-NEXT: vor.vv v10, v10, v13 -; RV32-NEXT: vsrl.vi v13, v8, 8 -; RV32-NEXT: vand.vx v9, v9, a5 -; RV32-NEXT: vand.vv v13, v13, v12 -; RV32-NEXT: vor.vv v9, v13, v9 -; RV32-NEXT: vand.vv v12, v8, v12 +; RV32-NEXT: vand.vv v10, v10, v13 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vand.vv v10, v8, v13 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v12, v12, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vor.vv v9, v9, v11 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vor.vv v9, v9, v12 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -530,31 +530,31 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v12, v8, a2 +; RV32-NEXT: vsll.vx v14, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v14, v8, a2 -; RV32-NEXT: vsrl.vx v16, v8, a4 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v18, v8, a4 +; RV32-NEXT: vand.vx v10, v10, a5 +; RV32-NEXT: vand.vx v18, v18, a1 +; RV32-NEXT: vor.vv v16, v18, v16 ; RV32-NEXT: vand.vx v18, v8, a1 -; RV32-NEXT: vand.vx v16, v16, a1 -; RV32-NEXT: vor.vv v14, v16, v14 +; RV32-NEXT: vsll.vx v18, v18, a4 +; RV32-NEXT: vor.vv v14, v14, v18 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vlse64.v v16, (a6), zero +; RV32-NEXT: vlse64.v v18, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsll.vx v18, v18, a4 -; RV32-NEXT: vor.vv v12, v12, v18 -; RV32-NEXT: vsrl.vi v18, v8, 8 -; RV32-NEXT: vand.vx v10, v10, a5 -; RV32-NEXT: vand.vv v18, v18, v16 -; RV32-NEXT: vor.vv v10, v18, v10 -; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vv v12, v12, v18 +; 
RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vand.vv v12, v8, v18 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v16, v16, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vor.vv v10, v10, v14 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vor.vv v10, v10, v16 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -690,31 +690,31 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v16, v8, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsll.vx v16, v8, a2 +; RV32-NEXT: vsll.vx v20, v8, a2 ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vsrl.vx v20, v8, a2 -; RV32-NEXT: vsrl.vx v24, v8, a4 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vsrl.vx v28, v8, a4 +; RV32-NEXT: vand.vx v12, v12, a5 +; RV32-NEXT: vand.vx v28, v28, a1 +; RV32-NEXT: vor.vv v24, v28, v24 ; RV32-NEXT: vand.vx v28, v8, a1 -; RV32-NEXT: vand.vx v24, v24, a1 -; RV32-NEXT: vor.vv v20, v24, v20 +; RV32-NEXT: vsll.vx v28, v28, a4 +; RV32-NEXT: vor.vv v20, v20, v28 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vlse64.v v24, (a6), zero +; RV32-NEXT: vlse64.v v28, (a6), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsll.vx v28, v28, a4 -; RV32-NEXT: vor.vv v16, v16, v28 -; RV32-NEXT: vsrl.vi v28, v8, 8 -; RV32-NEXT: vand.vx v12, v12, a5 -; RV32-NEXT: vand.vv v28, v28, v24 -; RV32-NEXT: vor.vv v12, v28, v12 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vv v16, v16, v28 +; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vand.vv v16, v8, v28 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vor.vv v12, v12, v20 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v20, v8 +; RV32-NEXT: vor.vv v12, v12, v24 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -768,61 +768,63 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 -; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vand.vx v24, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v16, a1, v0.t ; RV32-NEXT: vsll.vx v24, v24, a4, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v8, v16, a5, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: 
addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v8, (a6), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a3, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: vsll.vi v8, v24, 8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsll.vi v16, v24, 8, v0.t ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v16, a4, v0.t ; RV32-NEXT: vand.vx v24, v24, a1, v0.t -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a3, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a5, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 @@ -916,48 +918,48 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 3 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a3 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) 
+; RV32-NEXT: vand.vx v0, v0, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a6), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1031,61 +1033,63 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: lui a1, 1044480 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 -; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: lui a5, 4080 +; RV32-NEXT: addi a6, sp, 8 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v16, v8, a2, v0.t +; RV32-NEXT: vsll.vx v8, v8, a2, v0.t ; RV32-NEXT: addi a1, a3, -256 -; RV32-NEXT: vand.vx v24, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v16, a1, v0.t ; RV32-NEXT: vsll.vx v24, v24, a4, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v8, v16, a5, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vlse64.v v8, (a6), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 4080 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vx v24, v8, a3, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: vsll.vi v8, v24, 8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsll.vi v16, v24, 8, v0.t ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; 
RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t +; RV32-NEXT: vsrl.vx v24, v16, a4, v0.t ; RV32-NEXT: vand.vx v24, v24, a1, v0.t -; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t -; RV32-NEXT: vand.vx v24, v24, a3, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a5, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 @@ -1179,48 +1183,48 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: lui a3, 16 ; RV32-NEXT: li a4, 40 ; RV32-NEXT: lui a5, 4080 -; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsll.vx v24, v8, a2 -; RV32-NEXT: addi a1, a3, -256 ; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vsrl.vx v0, v8, a4 -; RV32-NEXT: vand.vx v0, v0, a1 +; RV32-NEXT: vand.vx v0, v0, a3 ; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a1 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: slli a6, a6, 3 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a3 ; RV32-NEXT: vsll.vx v0, v0, a4 ; RV32-NEXT: vor.vv v16, v24, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: vand.vx v0, v0, a5 +; RV32-NEXT: vsrl.vi v24, v8, 8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a6), zero +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a5 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vl8r.v 
v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 @@ -1298,9 +1302,9 @@ define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 8 +; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll index dbbb8362144ca..781c61b571994 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll @@ -9,10 +9,10 @@ define <4 x i32> @add_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = add i32 %a, 23 @@ -37,10 +37,10 @@ define <8 x i32> @add_constant_rhs_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a5 ; CHECK-NEXT: vslide1down.vx v8, v8, a6 ; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %e0 = add i32 %a, 23 @@ -70,10 +70,10 @@ define <4 x i32> @sub_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = sub i32 %a, 23 @@ -94,10 +94,10 @@ define <4 x i32> @mul_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI3_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vmul.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = mul i32 %a, 23 @@ -125,15 +125,15 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: lui a0, %hi(.LCPI4_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_1) ; CHECK-NEXT: vslide1down.vx v9, v9, a1 -; CHECK-NEXT: vle32.v v11, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vmulhu.vv v10, v8, v10 -; CHECK-NEXT: vsub.vv v12, v8, v10 -; CHECK-NEXT: vmulhu.vv v9, v12, v9 +; CHECK-NEXT: vsub.vv v11, v8, v10 +; CHECK-NEXT: vmulhu.vv v9, 
v11, v9 +; CHECK-NEXT: vle32.v v11, (a0) ; CHECK-NEXT: vadd.vv v9, v9, v10 -; CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: vsrl.vv v9, v9, v11 +; CHECK-NEXT: vmv.v.i v0, 4 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %e0 = udiv i32 %a, 23 @@ -155,10 +155,10 @@ define <4 x float> @fadd_constant_rhs(float %a, float %b, float %c, float %d) { ; CHECK-NEXT: vfmv.v.f v8, fa0 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = fadd float %a, 23.0 @@ -179,10 +179,10 @@ define <4 x float> @fdiv_constant_rhs(float %a, float %b, float %c, float %d) { ; CHECK-NEXT: vfmv.v.f v8, fa0 ; CHECK-NEXT: lui a0, %hi(.LCPI6_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vfdiv.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = fdiv float %a, 23.0 @@ -317,10 +317,10 @@ define <4 x i32> @add_constant_rhs_inverse(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI11_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = sub i32 %a, 1 @@ -341,10 +341,10 @@ define <4 x i32> @add_constant_rhs_commute(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI12_0) -; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %e0 = add i32 %a, 23 @@ -562,21 +562,20 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.s.x v12, a1 ; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) +; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 5 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: vmv.s.x v10, a2 ; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 6 ; CHECK-NEXT: vmv.s.x v10, a3 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %vadd = add <8 x i32> %vin, %e0 = add i32 %a, 23 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll index 60a9948198c8f..78a6acfac4581 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll 
@@ -87,14 +87,14 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK-NEXT: addi a2, a1, 256 ; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: addi a2, a1, 384 -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: addi a2, a0, 256 -; CHECK-NEXT: vse32.v v24, (a0) +; CHECK-NEXT: vle32.v v24, (a2) +; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vse32.v v0, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse32.v v0, (a1) -; CHECK-NEXT: vse32.v v16, (a2) +; CHECK-NEXT: vse32.v v24, (a2) +; CHECK-NEXT: vse32.v v16, (a1) ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v = load <128 x i32>, ptr %x @@ -207,14 +207,15 @@ define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x ; CHECK-NEXT: addi s0, sp, 256 ; CHECK-NEXT: .cfi_def_cfa s0, 0 ; CHECK-NEXT: andi sp, sp, -128 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: mv a3, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vse32.v v24, (a0) ; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: li a2, 42 -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: vmv.v.v v8, v24 ; CHECK-NEXT: call ext3 ; CHECK-NEXT: addi sp, s0, -256 ; CHECK-NEXT: .cfi_def_cfa sp, 256 @@ -269,8 +270,8 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3 ; CHECK-NEXT: mv t3, sp ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: li t4, 8 ; CHECK-NEXT: vse32.v v8, (t0) +; CHECK-NEXT: li t4, 8 ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: call vector_arg_indirect_stack @@ -307,17 +308,15 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3 define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) { ; CHECK-LABEL: pass_vector_arg_direct_stack: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -176 -; CHECK-NEXT: .cfi_def_cfa_offset 176 -; CHECK-NEXT: sd ra, 168(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 160(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 160 +; CHECK-NEXT: sd ra, 152(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 -; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: addi t0, sp, 16 ; CHECK-NEXT: li t1, 1 ; CHECK-NEXT: li t2, 13 -; CHECK-NEXT: li s0, 12 +; CHECK-NEXT: li t5, 12 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: li a2, 2 ; CHECK-NEXT: li a3, 3 @@ -326,23 +325,21 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> ; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: li a7, 7 ; CHECK-NEXT: li t3, 8 +; CHECK-NEXT: sd t1, 144(sp) +; CHECK-NEXT: li t4, 9 +; CHECK-NEXT: sd t5, 0(sp) +; CHECK-NEXT: sd t2, 8(sp) +; CHECK-NEXT: li t5, 10 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vse32.v v8, (t0) -; CHECK-NEXT: li t4, 9 -; CHECK-NEXT: li t5, 10 -; CHECK-NEXT: sd t1, 144(sp) ; CHECK-NEXT: li t6, 11 -; CHECK-NEXT: sd s0, 0(sp) -; CHECK-NEXT: sd t2, 8(sp) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: call vector_arg_direct_stack -; CHECK-NEXT: ld ra, 168(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 160(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; CHECK-NEXT: .cfi_restore ra -; CHECK-NEXT: .cfi_restore s0 
-; CHECK-NEXT: addi sp, sp, 176 +; CHECK-NEXT: addi sp, sp, 160 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll index f42b4a3a26aad..34600d9a0eaf4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll @@ -87,14 +87,14 @@ define <128 x i32> @ret_split_v128i32(ptr %x) { ; CHECK-NEXT: addi a2, a1, 256 ; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: addi a2, a1, 384 -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: addi a2, a0, 256 -; CHECK-NEXT: vse32.v v24, (a0) +; CHECK-NEXT: vle32.v v24, (a2) +; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vse32.v v0, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse32.v v0, (a1) -; CHECK-NEXT: vse32.v v16, (a2) +; CHECK-NEXT: vse32.v v24, (a2) +; CHECK-NEXT: vse32.v v16, (a1) ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %v = load <128 x i32>, ptr %x @@ -207,14 +207,15 @@ define <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x ; CHECK-NEXT: addi s0, sp, 256 ; CHECK-NEXT: .cfi_def_cfa s0, 0 ; CHECK-NEXT: andi sp, sp, -128 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: mv a3, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vse32.v v24, (a0) ; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: li a2, 42 -; CHECK-NEXT: vse32.v v8, (a3) -; CHECK-NEXT: vmv.v.v v8, v24 ; CHECK-NEXT: call ext3 ; CHECK-NEXT: addi sp, s0, -256 ; CHECK-NEXT: .cfi_def_cfa sp, 256 @@ -267,9 +268,9 @@ define <32 x i32> @call_split_vector_args(ptr %pa, ptr %pb) { ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: mv a1, sp ; CHECK-NEXT: mv a0, sp -; CHECK-NEXT: vse32.v v16, (a1) +; CHECK-NEXT: vse32.v v16, (a0) +; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vmv1r.v v9, v8 ; CHECK-NEXT: vmv1r.v v10, v8 ; CHECK-NEXT: vmv1r.v v11, v8 @@ -313,7 +314,7 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x ; CHECK-NEXT: sd ra, 136(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: li t0, 8 +; CHECK-NEXT: li a7, 8 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: li a2, 2 ; CHECK-NEXT: li a3, 3 @@ -322,9 +323,9 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x ; CHECK-NEXT: li a6, 6 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: sd a7, 128(sp) ; CHECK-NEXT: vse32.v v8, (sp) ; CHECK-NEXT: li a7, 7 -; CHECK-NEXT: sd t0, 128(sp) ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: call vector_arg_via_stack @@ -378,8 +379,8 @@ define <4 x i1> @pass_vector_mask_arg_via_stack(<4 x i1> %v) { ; CHECK-NEXT: vmv.v.v v17, v16 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmsne.vi v16, v17, 0 -; CHECK-NEXT: li a7, 7 ; CHECK-NEXT: vsm.v v16, (a2) +; CHECK-NEXT: li a7, 7 ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: li 
a1, 0 ; CHECK-NEXT: li a2, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll index a9b255bb62aeb..3c79f42177721 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll @@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.ceil.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -35,12 +35,12 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -59,12 +59,12 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -77,11 +77,11 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.ceil.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, 
v0.t ; ZVFH-NEXT: fsrm a0 @@ -121,12 +121,12 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -145,12 +145,12 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,11 +163,11 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.ceil.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -207,12 +207,12 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -231,12 +231,12 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; 
ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -249,11 +249,11 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -273,12 +273,12 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) -; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vmv1r.v v0, v10 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -295,12 +295,12 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -319,12 +319,12 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 3 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -337,11 +337,11 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -363,9 +363,9 @@ define <2 x float> @vp_ceil_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui 
a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -384,8 +384,8 @@ define <2 x float> @vp_ceil_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -405,9 +405,9 @@ define <4 x float> @vp_ceil_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -426,8 +426,8 @@ define <4 x float> @vp_ceil_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -448,9 +448,9 @@ define <8 x float> @vp_ceil_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -470,8 +470,8 @@ define <8 x float> @vp_ceil_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -492,9 +492,9 @@ define <16 x float> @vp_ceil_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -514,8 +514,8 @@ define <16 x float> @vp_ceil_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.ceil.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_ceil_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) ; CHECK-NEXT: 
vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI16_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -552,12 +552,12 @@ define <2 x double> @vp_ceil_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %e define <2 x double> @vp_ceil_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -575,12 +575,12 @@ define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -596,12 +596,12 @@ define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %e define <4 x double> @vp_ceil_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -619,12 +619,12 @@ define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -640,12 +640,12 @@ define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %e define <8 x double> @vp_ceil_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 
+; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -663,12 +663,12 @@ define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -684,12 +684,12 @@ define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroex define <15 x double> @vp_ceil_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v15f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -707,12 +707,12 @@ define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -728,12 +728,12 @@ define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroex define <16 x double> @vp_ceil_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -762,8 +762,8 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -778,33 +778,33 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, 
m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 9d0d42cf754c5..99007aaa8a106 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -1503,38 +1503,29 @@ declare <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32) define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1547,57 +1538,34 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, 
v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1671,47 +1639,49 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 
8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vand.vv v0, v24, v16 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1775,38 +1745,29 @@ declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded 
Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -1819,57 +1780,34 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1943,47 +1881,49 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 16 -; 
RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vand.vv v0, v24, v16 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -2055,7 +1995,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -2072,12 +2013,12 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a1, 16 -; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a2, 16(sp) ; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a1, 16 ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: bltu a0, a1, .LBB34_2 ; RV32-NEXT: # %bb.1: @@ -2087,7 +2028,6 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: addi a4, sp, 32 ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -2102,34 +2042,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x 
i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a3, a3, a5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a4), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -2137,38 +2068,41 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -2180,61 +2114,37 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> 
%m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a3, a3, a5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -2244,7 +2154,8 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload @@ -2266,18 +2177,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; 
RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -2290,41 +2203,25 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2332,21 +2229,20 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -2372,9 +2268,9 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: mv a1, a0 ; RV64-NEXT: bltu a0, a2, .LBB34_2 ; RV64-NEXT: # %bb.1: @@ -2495,14 +2391,14 @@ define <32 x i64> 
@vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a2, 16 ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a3, .LBB35_2 +; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: @@ -2550,76 +2446,58 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vi v0, v16, 8 ; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 16 -; RV32-NEXT: vor.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v0, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v0, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v0, v16, a2 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v8 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vadd.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vnot.v v24, v24 +; RV32-NEXT: vsrl.vi v0, v24, 1 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsub.vv v0, v16, v0 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v24, v24, v0 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v0, v8 -; RV32-NEXT: vsrl.vi v0, v0, 2 -; 
RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: vand.vv v0, v24, v8 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v8, v24, v8 +; RV32-NEXT: addi a2, sp, 24 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -4213,38 +4091,29 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4257,57 +4126,34 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv 
v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -4381,47 +4227,49 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: 
vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vand.vv v0, v24, v16 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -4483,38 +4331,29 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4527,57 +4366,34 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t 
; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vor.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -4651,47 +4467,49 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vor.vv v8, 
v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vand.vv v0, v24, v16 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -4761,7 +4579,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -4778,12 +4597,12 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a1, 16 -; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a2, 16(sp) ; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a1, 16 ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: bltu a0, a1, .LBB70_2 ; RV32-NEXT: # %bb.1: @@ -4793,7 +4612,6 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: addi a3, sp, 40 -; RV32-NEXT: addi a4, sp, 32 ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -4808,34 +4626,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a3, a3, a5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, 
a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v8, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a4), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -4843,38 +4652,41 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 48 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill @@ -4886,61 +4698,37 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, 
a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a3, a3, a5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -4950,7 +4738,8 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload @@ -4972,18 +4761,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: 
slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -4996,41 +4787,25 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5038,21 +4813,20 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -5078,9 +4852,9 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: mv a1, a0 ; RV64-NEXT: bltu a0, a2, .LBB70_2 ; RV64-NEXT: # %bb.1: @@ -5201,14 +4975,14 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: addi a1, a2, 257 
-; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a2, 16 ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a3, .LBB71_2 +; RV32-NEXT: bltu a0, a2, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: @@ -5256,76 +5030,58 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vi v0, v16, 8 ; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vnot.v v0, v8 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 16 -; RV32-NEXT: vor.vv v16, v16, v8 +; RV32-NEXT: vsrl.vi v0, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v0, v8, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v24, v0, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v0, v16, a2 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v8 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v8 -; RV32-NEXT: vadd.vv v24, v0, v24 +; RV32-NEXT: vand.vv v16, v0, v8 +; RV32-NEXT: vsrl.vi v0, v0, 2 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vnot.v v24, v24 +; RV32-NEXT: vsrl.vi v0, v24, 1 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vsub.vv v0, v16, v0 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vsub.vv v24, v24, v0 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v0, v8 -; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v8, v0, v8 +; RV32-NEXT: vand.vv v0, v24, v8 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v8, v24, v8 +; RV32-NEXT: addi a2, sp, 24 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, 
v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index a5a1061842427..dea0ebfd56946 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1119,70 +1119,55 @@ declare <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64>, <15 x i1>, i32) define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v24, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v24, v0.t ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: 
vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v24, v16, v0.t +; RV32-NEXT: vadd.vv v16, v24, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1248,26 +1233,28 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 @@ -1318,70 +1305,55 @@ declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32) define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; 
RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vand.vv v24, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v24, v0.t ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v24, v16, v0.t +; RV32-NEXT: vadd.vv v16, v24, v16, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1447,26 +1419,28 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, 
m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 @@ -1520,17 +1494,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -1542,102 +1517,118 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a2, 16 ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a3, .LBB34_2 +; RV32-NEXT: bltu a0, a2, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: addi a2, sp, 40 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: addi a2, sp, 32 -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 
40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a2), zero ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v8, 
v16, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -1645,51 +1636,83 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: li a3, 48 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) 
# Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ -1710,9 +1733,9 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a2, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: mv a1, a0 ; RV64-NEXT: bltu a0, a2, .LBB34_2 ; RV64-NEXT: # %bb.1: @@ -1792,12 +1815,9 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv8r.v v24, v16 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -1809,135 +1829,103 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a2, 16 ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a3, .LBB35_2 +; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: addi a2, sp, 40 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: addi a2, a0, -16 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v0, v16 +; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), 
zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v8, v24 +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v24, 1 -; RV32-NEXT: vand.vv v16, v24, v16 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v16, v24, v16 +; RV32-NEXT: vand.vv v0, v8, v24 ; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vand.vv v0, v16, v24 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: addi a2, sp, 48 +; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v8, v16 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a2), zero -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a2), zero +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 -; RV32-NEXT: vadd.vv v16, v24, v16 
-; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a2), zero +; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a2), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v24, v0 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, v0 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v24, a2 +; RV32-NEXT: vsrl.vx v16, v16, a2 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 ; RV32-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll index 4fbe67cfcd642..a39fc835f9d85 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll @@ -167,8 +167,6 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; ; RV64-LABEL: ctpop_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 349525 ; RV64-NEXT: lui a2, 209715 ; RV64-NEXT: lui a3, 61681 @@ -185,6 +183,8 @@ define void @ctpop_v2i64(ptr %x, ptr %y) { ; RV64-NEXT: add a3, a3, a5 ; RV64-NEXT: slli a5, a4, 32 ; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: vand.vx v9, v9, a1 ; RV64-NEXT: vsub.vv v8, v8, v9 @@ -473,8 +473,6 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; ; RV64-LABEL: ctpop_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 349525 ; RV64-NEXT: lui a2, 209715 ; RV64-NEXT: lui a3, 61681 @@ -491,6 +489,8 @@ define void @ctpop_v4i64(ptr %x, ptr %y) { ; RV64-NEXT: add a3, a3, a5 ; RV64-NEXT: slli a5, a4, 32 ; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: vand.vx v10, v10, a1 ; RV64-NEXT: vsub.vv v8, v8, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index 5f275da1740cb..093ddc36bf7f9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -1263,91 +1263,59 @@ declare <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64>, i1 immarg, <15 x i1>, i32) define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 
0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, 
v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1419,29 +1387,31 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsrl.vi v8, v16, 1 -; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v16, v16, v0 -; RV32-NEXT: vand.vv v0, v16, v24 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1495,91 +1465,59 @@ declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, 
sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1651,29 +1589,31 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi 
a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsrl.vi v8, v16, 1 -; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v16, v16, v0 -; RV32-NEXT: vand.vv v0, v16, v24 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -1730,18 +1670,17 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -1753,12 +1692,12 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a1, 16 -; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a2, 16(sp) ; RV32-NEXT: sw a2, 20(sp) +; RV32-NEXT: li a1, 16 ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: bltu a0, a1, .LBB34_2 ; RV32-NEXT: # %bb.1: @@ -1771,95 +1710,116 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 +; RV32-NEXT: li a5, 48 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi 
a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v24, v16, v24, v0.t -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, 
(a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -1867,84 +1827,88 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv 
v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ -1965,9 +1929,9 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a1, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: mv a4, a0 ; RV64-NEXT: bltu a0, a1, .LBB34_2 ; RV64-NEXT: # %bb.1: @@ -2051,45 +2015,41 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: sw a2, 32(sp) -; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 
20(sp) +; RV32-NEXT: addi a2, a2, 257 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: li a2, 16 ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a3, .LBB35_2 +; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v8, v8, a2 ; RV32-NEXT: vand.vv v8, v0, v8 @@ -2100,59 +2060,50 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vsub.vv v0, v8, v0 +; RV32-NEXT: vsub.vv v8, v8, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v0, v8 -; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v0, v0, v8 -; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v16, v8 +; RV32-NEXT: vand.vv v24, v16, v0 ; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: mv a3, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a2), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v0 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, 
v0 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v24, a2 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -3460,91 +3411,59 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext % define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -3616,29 +3535,31 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsrl.vi v8, v16, 1 -; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v16, v16, v0 -; RV32-NEXT: vand.vv v0, v16, v24 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3689,92 +3610,60 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v16i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: sw a1, 32(sp) -; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t +; RV32-NEXT: vand.vv v24, v8, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a1), zero -; RV32-NEXT: addi a1, sp, 48 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: addi a1, sp, 24 -; RV32-NEXT: vsub.vv v8, v16, v8, v0.t -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vsub.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t +; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v24, v16, v24, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t +; RV32-NEXT: vadd.vv v16, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a1), zero +; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v24, 4, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: li a0, 56 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 48 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v24, 
v0.t -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -3846,29 +3735,31 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex ; RV32-NEXT: sw a1, 4(sp) ; RV32-NEXT: addi a1, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a1), zero -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vlse64.v v24, (a1), zero +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v16 -; RV32-NEXT: vsrl.vi v8, v16, 1 -; RV32-NEXT: vand.vv v0, v8, v0 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vv v16, v16, v0 -; RV32-NEXT: vand.vv v0, v16, v24 -; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v0, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v0 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 @@ -3923,18 +3814,17 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: vslidedown.vi v24, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 @@ -3946,12 +3836,12 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sw a2, 36(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a1, 16 -; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: sw a2, 16(sp) ; RV32-NEXT: sw a2, 
20(sp) +; RV32-NEXT: li a1, 16 ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: bltu a0, a1, .LBB70_2 ; RV32-NEXT: # %bb.1: @@ -3964,95 +3854,116 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 +; RV32-NEXT: li a5, 48 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 48 ; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v24, v16, v24, v0.t -; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # 
Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 -; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero -; RV32-NEXT: addi a3, sp, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vlse64.v v8, (a4), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v16, v8, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill @@ -4060,84 +3971,88 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; 
RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 48 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 48 @@ -4158,9 +4073,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a1, 16 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: mv a4, a0 ; RV64-NEXT: bltu a0, a1, .LBB70_2 ; RV64-NEXT: # %bb.1: @@ -4244,45 +4159,41 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z define <32 x i64> 
@vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: sw a1, 40(sp) -; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a1, 28(sp) ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: sw a2, 32(sp) -; RV32-NEXT: sw a2, 36(sp) +; RV32-NEXT: sw a2, 16(sp) +; RV32-NEXT: sw a2, 20(sp) ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: li a3, 16 -; RV32-NEXT: addi a1, a2, 257 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: addi a2, a2, 257 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: sw a2, 4(sp) +; RV32-NEXT: li a2, 16 ; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a3, .LBB71_2 +; RV32-NEXT: bltu a0, a2, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vnot.v v0, v8 -; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 -; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v8, v8, a2 ; RV32-NEXT: vand.vv v8, v0, v8 @@ -4293,59 +4204,50 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vsub.vv v0, v8, v0 +; RV32-NEXT: vsub.vv v8, v8, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v16, 1 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsrl.vi v0, v16, 1 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a3), zero -; RV32-NEXT: addi a2, sp, 24 -; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v0, v8 -; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v0, v0, v8 -; RV32-NEXT: vadd.vv v24, v24, v0 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v16, v8 +; RV32-NEXT: vand.vv v24, v16, v0 ; RV32-NEXT: vsrl.vi v16, v16, 2 -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: addi a2, sp, 8 +; RV32-NEXT: mv a3, sp +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: addi a4, sp, 48 -; RV32-NEXT: vs8r.v v16, (a4) # 
Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a2), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v0, (a3), zero -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: addi a2, sp, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v0 +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v24, v8, v0 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v16, a2 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v24, a2 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 57e0eeb92ee2f..ddf92af2312cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -45,9 +45,9 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vnsrl.wi v10, v12, 23 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v9, v10, 0 +; RVF-NEXT: vsub.vx v9, v9, a1 ; RVF-NEXT: vmseq.vi v0, v8, 0 -; RVF-NEXT: vsub.vx v8, v9, a1 -; RVF-NEXT: vmerge.vim v8, v8, 8, v0 +; RVF-NEXT: vmerge.vim v8, v9, 8, v0 ; RVF-NEXT: vse8.v v8, (a0) ; RVF-NEXT: ret ; @@ -64,9 +64,9 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vnsrl.wi v10, v12, 23 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v9, v10, 0 +; RVD-NEXT: vsub.vx v9, v9, a1 ; RVD-NEXT: vmseq.vi v0, v8, 0 -; RVD-NEXT: vsub.vx v8, v9, a1 -; RVD-NEXT: vmerge.vim v8, v8, 8, v0 +; RVD-NEXT: vmerge.vim v8, v9, 8, v0 ; RVD-NEXT: vse8.v v8, (a0) ; RVD-NEXT: ret ; @@ -390,10 +390,10 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVI-LABEL: cttz_v32i8: ; RVI: # %bb.0: ; RVI-NEXT: li a1, 32 +; RVI-NEXT: li a2, 1 ; RVI-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RVI-NEXT: vle8.v v8, (a0) -; RVI-NEXT: li a1, 1 -; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: vsub.vx v10, v8, a2 ; RVI-NEXT: li a1, 85 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 @@ -425,9 +425,9 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vnsrl.wi v12, v16, 23 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v10, v12, 0 +; RVF-NEXT: vsub.vx v10, v10, a1 ; RVF-NEXT: vmseq.vi v0, v8, 0 -; RVF-NEXT: vsub.vx v8, v10, a1 -; RVF-NEXT: vmerge.vim v8, v8, 8, v0 +; RVF-NEXT: vmerge.vim v8, v10, 8, v0 ; RVF-NEXT: vse8.v v8, (a0) ; RVF-NEXT: ret ; @@ -445,9 +445,9 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind 
{ ; RVD-NEXT: vnsrl.wi v12, v16, 23 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v10, v12, 0 +; RVD-NEXT: vsub.vx v10, v10, a1 ; RVD-NEXT: vmseq.vi v0, v8, 0 -; RVD-NEXT: vsub.vx v8, v10, a1 -; RVD-NEXT: vmerge.vim v8, v8, 8, v0 +; RVD-NEXT: vmerge.vim v8, v10, 8, v0 ; RVD-NEXT: vse8.v v8, (a0) ; RVD-NEXT: ret ; @@ -1121,10 +1121,10 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVI-LABEL: cttz_zero_undef_v32i8: ; RVI: # %bb.0: ; RVI-NEXT: li a1, 32 +; RVI-NEXT: li a2, 1 ; RVI-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RVI-NEXT: vle8.v v8, (a0) -; RVI-NEXT: li a1, 1 -; RVI-NEXT: vsub.vx v10, v8, a1 +; RVI-NEXT: vsub.vx v10, v8, a2 ; RVI-NEXT: li a1, 85 ; RVI-NEXT: vnot.v v8, v8 ; RVI-NEXT: vand.vv v8, v8, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index b4634dbf5a5e8..b611fcd9ddb33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -22,10 +22,10 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) { ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vadd.vi v12, v11, -16 +; CHECK-NEXT: vadd.vi v11, v11, -15 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vadd.vi v11, v11, -15 ; CHECK-NEXT: vmerge.vim v13, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll index e13f4f4b50b0f..76e1ae0a69c24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll @@ -26,26 +26,26 @@ define void @add_v4i32(ptr %x, ptr %y) { define void @add_v2i64(ptr %x, ptr %y) { ; RV32-LABEL: add_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a3, 4(a1) -; RV32-NEXT: lw a4, 0(a0) -; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: lw a6, 8(a0) -; RV32-NEXT: lw a7, 12(a0) -; RV32-NEXT: lw t0, 12(a1) -; RV32-NEXT: lw a1, 8(a1) -; RV32-NEXT: add a3, a5, a3 -; RV32-NEXT: add a2, a4, a2 -; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: sltu a4, a2, a4 -; RV32-NEXT: sltu a5, a1, a6 -; RV32-NEXT: add a3, a3, a4 -; RV32-NEXT: add a5, a7, a5 -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a3, 4(a0) -; RV32-NEXT: sw a1, 8(a0) -; RV32-NEXT: sw a5, 12(a0) +; RV32-NEXT: lw a2, 0(a0) +; RV32-NEXT: lw a3, 4(a0) +; RV32-NEXT: lw a4, 8(a0) +; RV32-NEXT: lw a5, 12(a0) +; RV32-NEXT: lw a6, 0(a1) +; RV32-NEXT: lw a7, 4(a1) +; RV32-NEXT: lw t0, 8(a1) +; RV32-NEXT: lw a1, 12(a1) +; RV32-NEXT: add a3, a3, a7 +; RV32-NEXT: add a6, a2, a6 +; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: add t0, a4, t0 +; RV32-NEXT: sltu a2, a6, a2 +; RV32-NEXT: sltu a4, t0, a4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a1, a1, a4 +; RV32-NEXT: sw a6, 0(a0) +; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: sw t0, 8(a0) +; RV32-NEXT: sw a1, 12(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: add_v2i64: @@ -89,14 +89,14 @@ define void @add_v1i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: lw a2, 0(a0) ; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: lw a4, 4(a1) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: add a3, a3, a4 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: sltu a2, a1, a2 -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: sw a2, 4(a0) +; 
RV32-NEXT: lw a4, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a4, a2, a4 +; RV32-NEXT: sltu a2, a4, a2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sw a4, 0(a0) +; RV32-NEXT: sw a1, 4(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: add_v1i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll index bb2b57fbcc3b7..54489765cff1a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll @@ -308,9 +308,9 @@ define void @truncstore_v2i8_v2i1(<2 x i8> %x, ptr %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll index e53876d69b59b..b350268a3c10c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll @@ -10,9 +10,9 @@ define i1 @extractelt_v1i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.s.x v9, zero ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -27,9 +27,9 @@ define i1 @extractelt_v2i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -44,9 +44,9 @@ define i1 @extractelt_v4i1(ptr %x, i64 %idx) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vslidedown.vx v8, v8, a1 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -328,13 +328,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: mv a2, sp ; RV32-NEXT: li a3, 128 ; RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; RV32-NEXT: vle8.v v8, (a0) -; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle8.v v16, (a0) +; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vmseq.vi v0, v8, 0 +; RV32-NEXT: vle8.v v24, (a0) +; RV32-NEXT: vmseq.vi v8, v24, 0 ; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vmseq.vi v8, v16, 0 +; RV32-NEXT: vmseq.vi v0, v16, 0 ; RV32-NEXT: vmerge.vim v16, v24, 1, v0 ; RV32-NEXT: vse8.v v16, (a2) ; RV32-NEXT: vmv1r.v v0, v8 @@ -359,13 +359,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: mv a2, sp ; RV64-NEXT: li a3, 128 ; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; RV64-NEXT: vle8.v v8, (a0) -; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle8.v v16, (a0) +; RV64-NEXT: addi a0, a0, 128 ; 
RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: vmseq.vi v0, v8, 0 +; RV64-NEXT: vle8.v v24, (a0) +; RV64-NEXT: vmseq.vi v8, v24, 0 ; RV64-NEXT: vmv.v.i v24, 0 -; RV64-NEXT: vmseq.vi v8, v16, 0 +; RV64-NEXT: vmseq.vi v0, v16, 0 ; RV64-NEXT: vmerge.vim v16, v24, 1, v0 ; RV64-NEXT: vse8.v v16, (a2) ; RV64-NEXT: vmv1r.v v0, v8 @@ -390,13 +390,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32ZBS-NEXT: mv a2, sp ; RV32ZBS-NEXT: li a3, 128 ; RV32ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; RV32ZBS-NEXT: vle8.v v8, (a0) -; RV32ZBS-NEXT: addi a0, a0, 128 ; RV32ZBS-NEXT: vle8.v v16, (a0) +; RV32ZBS-NEXT: addi a0, a0, 128 ; RV32ZBS-NEXT: add a1, a2, a1 -; RV32ZBS-NEXT: vmseq.vi v0, v8, 0 +; RV32ZBS-NEXT: vle8.v v24, (a0) +; RV32ZBS-NEXT: vmseq.vi v8, v24, 0 ; RV32ZBS-NEXT: vmv.v.i v24, 0 -; RV32ZBS-NEXT: vmseq.vi v8, v16, 0 +; RV32ZBS-NEXT: vmseq.vi v0, v16, 0 ; RV32ZBS-NEXT: vmerge.vim v16, v24, 1, v0 ; RV32ZBS-NEXT: vse8.v v16, (a2) ; RV32ZBS-NEXT: vmv1r.v v0, v8 @@ -421,13 +421,13 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64ZBS-NEXT: mv a2, sp ; RV64ZBS-NEXT: li a3, 128 ; RV64ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; RV64ZBS-NEXT: vle8.v v8, (a0) -; RV64ZBS-NEXT: addi a0, a0, 128 ; RV64ZBS-NEXT: vle8.v v16, (a0) +; RV64ZBS-NEXT: addi a0, a0, 128 ; RV64ZBS-NEXT: add a1, a2, a1 -; RV64ZBS-NEXT: vmseq.vi v0, v8, 0 +; RV64ZBS-NEXT: vle8.v v24, (a0) +; RV64ZBS-NEXT: vmseq.vi v8, v24, 0 ; RV64ZBS-NEXT: vmv.v.i v24, 0 -; RV64ZBS-NEXT: vmseq.vi v8, v16, 0 +; RV64ZBS-NEXT: vmseq.vi v0, v16, 0 ; RV64ZBS-NEXT: vmerge.vim v16, v24, 1, v0 ; RV64ZBS-NEXT: vse8.v v16, (a2) ; RV64ZBS-NEXT: vmv1r.v v0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll index e9dca2c42e835..c7370102be738 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll @@ -560,12 +560,13 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) { ; VLA-NEXT: vlm.v v0, (a0) ; VLA-NEXT: vmv.v.i v8, 0 ; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, m1, ta, ma ; VLA-NEXT: vslidedown.vi v8, v8, 2 ; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; VLA-NEXT: vmsne.vi v0, v8, 0 -; VLA-NEXT: vmv.v.i v8, 0 -; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vmerge.vim v8, v9, 1, v0 ; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -581,12 +582,13 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) { ; VLS-NEXT: vlm.v v0, (a0) ; VLS-NEXT: vmv.v.i v8, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 ; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma ; VLS-NEXT: vslidedown.vi v8, v8, 2 ; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; VLS-NEXT: vmsne.vi v0, v8, 0 -; VLS-NEXT: vmv.v.i v8, 0 -; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vmerge.vim v8, v9, 1, v0 ; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLS-NEXT: vmv.v.i v9, 0 ; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -610,12 +612,13 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) { ; VLA-NEXT: li a0, 42 ; VLA-NEXT: vmv.v.i v8, 0 ; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v12, 0 ; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma ; VLA-NEXT: vslidedown.vx v8, v8, a0 ; VLA-NEXT: vsetivli zero, 2, 
e8, mf8, ta, ma ; VLA-NEXT: vmsne.vi v0, v8, 0 -; VLA-NEXT: vmv.v.i v8, 0 -; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vmerge.vim v8, v12, 1, v0 ; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -631,11 +634,12 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) { ; VLS-NEXT: vlm.v v0, (a0) ; VLS-NEXT: vmv.v.i v8, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 -; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; VLS-NEXT: vslidedown.vi v8, v10, 10 ; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; VLS-NEXT: vmsne.vi v0, v8, 0 ; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v9, v10, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v9, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 ; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLS-NEXT: vmv.v.i v9, 0 @@ -676,12 +680,13 @@ define void @extract_v2i1_nxv2i1_2( %x, ptr %y) { ; VLA-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; VLA-NEXT: vmv.v.i v8, 0 ; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; VLA-NEXT: vslidedown.vi v8, v8, 2 ; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; VLA-NEXT: vmsne.vi v0, v8, 0 -; VLA-NEXT: vmv.v.i v8, 0 -; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vmerge.vim v8, v9, 1, v0 ; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -696,12 +701,13 @@ define void @extract_v2i1_nxv2i1_2( %x, ptr %y) { ; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; VLS-NEXT: vmv.v.i v8, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 ; VLS-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; VLS-NEXT: vslidedown.vi v8, v8, 2 ; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; VLS-NEXT: vmsne.vi v0, v8, 0 -; VLS-NEXT: vmv.v.i v8, 0 -; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vmerge.vim v8, v9, 1, v0 ; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLS-NEXT: vmv.v.i v9, 0 ; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -740,12 +746,13 @@ define void @extract_v2i1_nxv64i1_2( %x, ptr %y) { ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -766,12 +773,13 @@ define void @extract_v2i1_nxv64i1_42( %x, ptr %y) { ; VLA-NEXT: vmv.v.i v8, 0 ; VLA-NEXT: li a1, 42 ; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v12, 0 ; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma ; VLA-NEXT: vslidedown.vx v8, v8, a1 ; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; VLA-NEXT: vmsne.vi v0, v8, 0 -; VLA-NEXT: vmv.v.i v8, 0 -; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vmerge.vim v8, v12, 1, v0 ; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -786,11 +794,12 @@ define void @extract_v2i1_nxv64i1_42( %x, ptr %y) { ; VLS-NEXT: vsetvli a1, zero, e8, 
m8, ta, ma ; VLS-NEXT: vmv.v.i v8, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 -; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; VLS-NEXT: vslidedown.vi v8, v10, 10 ; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; VLS-NEXT: vmsne.vi v0, v8, 0 ; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v9, v10, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v9, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 ; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLS-NEXT: vmv.v.i v9, 0 @@ -811,12 +820,13 @@ define void @extract_v2i1_nxv32i1_26( %x, ptr %y) { ; VLA-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; VLA-NEXT: vmv.v.i v8, 0 ; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v10, 0 ; VLA-NEXT: vsetivli zero, 2, e8, m2, ta, ma ; VLA-NEXT: vslidedown.vi v8, v8, 26 ; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; VLA-NEXT: vmsne.vi v0, v8, 0 -; VLA-NEXT: vmv.v.i v8, 0 -; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vmerge.vim v8, v10, 1, v0 ; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLA-NEXT: vmv.v.i v9, 0 ; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -831,11 +841,12 @@ define void @extract_v2i1_nxv32i1_26( %x, ptr %y) { ; VLS-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; VLS-NEXT: vmv.v.i v8, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 -; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; VLS-NEXT: vslidedown.vi v8, v9, 10 ; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; VLS-NEXT: vmsne.vi v0, v8, 0 ; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v9, v9, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v9, 0 ; VLS-NEXT: vmerge.vim v8, v8, 1, v0 ; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; VLS-NEXT: vmv.v.i v9, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll index 7e45136372b6c..f613449856e09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -626,11 +626,11 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32NOM-NEXT: andi a0, a1, 31 ; RV32NOM-NEXT: li a1, 4 ; RV32NOM-NEXT: call __mulsi3 -; RV32NOM-NEXT: li a1, 32 -; RV32NOM-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32NOM-NEXT: vle32.v v8, (s2) ; RV32NOM-NEXT: mv a1, sp +; RV32NOM-NEXT: li a2, 32 ; RV32NOM-NEXT: add a0, a1, a0 +; RV32NOM-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32NOM-NEXT: vle32.v v8, (s2) ; RV32NOM-NEXT: vadd.vv v8, v8, v8 ; RV32NOM-NEXT: vse32.v v8, (a1) ; RV32NOM-NEXT: lw a0, 0(a0) @@ -649,14 +649,14 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV32M-NEXT: addi s0, sp, 256 ; RV32M-NEXT: andi sp, sp, -128 ; RV32M-NEXT: andi a1, a1, 31 -; RV32M-NEXT: li a2, 32 -; RV32M-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32M-NEXT: vle32.v v8, (a0) +; RV32M-NEXT: mv a2, sp +; RV32M-NEXT: li a3, 32 ; RV32M-NEXT: slli a1, a1, 2 -; RV32M-NEXT: mv a0, sp -; RV32M-NEXT: or a1, a0, a1 +; RV32M-NEXT: or a1, a2, a1 +; RV32M-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32M-NEXT: vle32.v v8, (a0) ; RV32M-NEXT: vadd.vv v8, v8, v8 -; RV32M-NEXT: vse32.v v8, (a0) +; RV32M-NEXT: vse32.v v8, (a2) ; RV32M-NEXT: lw a0, 0(a1) ; RV32M-NEXT: addi sp, s0, -256 ; RV32M-NEXT: lw ra, 252(sp) # 4-byte Folded Reload @@ -676,11 +676,11 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV64NOM-NEXT: andi a0, a1, 31 ; RV64NOM-NEXT: li a1, 4 ; 
RV64NOM-NEXT: call __muldi3 -; RV64NOM-NEXT: li a1, 32 -; RV64NOM-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64NOM-NEXT: vle32.v v8, (s2) ; RV64NOM-NEXT: mv a1, sp +; RV64NOM-NEXT: li a2, 32 ; RV64NOM-NEXT: add a0, a1, a0 +; RV64NOM-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64NOM-NEXT: vle32.v v8, (s2) ; RV64NOM-NEXT: vadd.vv v8, v8, v8 ; RV64NOM-NEXT: vse32.v v8, (a1) ; RV64NOM-NEXT: lw a0, 0(a0) @@ -699,14 +699,14 @@ define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind { ; RV64M-NEXT: addi s0, sp, 256 ; RV64M-NEXT: andi sp, sp, -128 ; RV64M-NEXT: andi a1, a1, 31 -; RV64M-NEXT: li a2, 32 -; RV64M-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64M-NEXT: vle32.v v8, (a0) +; RV64M-NEXT: mv a2, sp +; RV64M-NEXT: li a3, 32 ; RV64M-NEXT: slli a1, a1, 2 -; RV64M-NEXT: mv a0, sp -; RV64M-NEXT: or a1, a0, a1 +; RV64M-NEXT: or a1, a2, a1 +; RV64M-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV64M-NEXT: vle32.v v8, (a0) ; RV64M-NEXT: vadd.vv v8, v8, v8 -; RV64M-NEXT: vse32.v v8, (a0) +; RV64M-NEXT: vse32.v v8, (a2) ; RV64M-NEXT: lw a0, 0(a1) ; RV64M-NEXT: addi sp, s0, -256 ; RV64M-NEXT: ld ra, 248(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll index ab2d00b9b9137..c328d5fbe6b0a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll @@ -10,11 +10,11 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -33,11 +33,11 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -56,11 +56,11 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -79,11 +79,11 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 
-; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -102,11 +102,11 @@ define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -128,9 +128,9 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -151,9 +151,9 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -174,9 +174,9 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -197,9 +197,9 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -220,9 +220,9 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -243,9 +243,9 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -264,11 +264,11 @@ define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -287,11 +287,11 @@ define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -310,11 +310,11 @@ define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -333,11 +333,11 @@ define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll index c6ce7c1bbe8b4..ebb75357cdfe7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll @@ -10,11 +10,11 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -33,11 +33,11 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -56,11 +56,11 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: 
vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -79,11 +79,11 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -102,11 +102,11 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -128,9 +128,9 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -151,9 +151,9 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -174,9 +174,9 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -197,9 +197,9 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -220,9 +220,9 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -243,9 +243,9 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { ; 
CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -264,11 +264,11 @@ define <1 x double> @floor_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -287,11 +287,11 @@ define <2 x double> @floor_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -310,11 +310,11 @@ define <4 x double> @floor_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -333,11 +333,11 @@ define <8 x double> @floor_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index d500469003aea..6536021da0313 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.floor.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ 
-35,12 +35,12 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -59,12 +59,12 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -77,11 +77,11 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.floor.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -121,12 +121,12 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -145,12 +145,12 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli 
zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,11 +163,11 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.floor.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -207,12 +207,12 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -231,12 +231,12 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -249,11 +249,11 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -273,12 +273,12 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x 
i1> %m, i32 zeroext % ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) -; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vmv1r.v v0, v10 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -295,12 +295,12 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -319,12 +319,12 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -337,11 +337,11 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -363,9 +363,9 @@ define <2 x float> @vp_floor_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -384,8 +384,8 @@ define <2 x float> @vp_floor_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -405,9 +405,9 @@ define <4 x float> @vp_floor_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, 
fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -426,8 +426,8 @@ define <4 x float> @vp_floor_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -448,9 +448,9 @@ define <8 x float> @vp_floor_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -470,8 +470,8 @@ define <8 x float> @vp_floor_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -492,9 +492,9 @@ define <16 x float> @vp_floor_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -514,8 +514,8 @@ define <16 x float> @vp_floor_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.floor.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_floor_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI16_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -552,12 +552,12 @@ define <2 x double> @vp_floor_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext % define <2 x double> @vp_floor_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, 
fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -575,12 +575,12 @@ define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -596,12 +596,12 @@ define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext % define <4 x double> @vp_floor_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -619,12 +619,12 @@ define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -640,12 +640,12 @@ define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext % define <8 x double> @vp_floor_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -663,12 +663,12 @@ define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -684,12 +684,12 @@ define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe define <15 x double> @vp_floor_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: 
vp_floor_v15f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -707,12 +707,12 @@ define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -728,12 +728,12 @@ define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe define <16 x double> @vp_floor_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -762,8 +762,8 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -778,33 +778,33 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: 
vmflt.vf v24, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll index 4f11e6c3c386a..dc5e2e213f781 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -59,16 +59,14 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -127,16 +125,14 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -197,15 +193,13 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma 
; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 @@ -269,15 +263,13 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v12, v12, v16, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 @@ -587,7 +579,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v25, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 @@ -601,29 +593,29 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB24_2 +; CHECK-NEXT: bltu a2, a3, .LBB24_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: mul a0, a0, a3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: vmv8r.v v8, v16 @@ -680,10 +672,10 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16, v0.t +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; 
CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll index e17ad303eddb8..eeb9ba155764c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll @@ -24,16 +24,14 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-LABEL: vfmax_v2f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -57,16 +55,14 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) { ; ZVFHMIN-LABEL: vfmax_v4f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -90,15 +86,13 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) { ; ZVFHMIN-LABEL: vfmax_v8f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 @@ -123,15 +117,13 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) { ; ZVFHMIN-LABEL: vfmax_v16f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, 
m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v12, v12, v16, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 @@ -295,8 +287,8 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmax_v2f16_vv_nnana: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vfadd.vv v8, v8, v8 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v10, v9, v8, v0 ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 ; ZVFH-NEXT: vmerge.vvm v8, v8, v9, v0 @@ -332,8 +324,8 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmax_v2f16_vv_nnanb: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH-NEXT: vmfeq.vv v0, v8, v8 ; ZVFH-NEXT: vfadd.vv v9, v9, v9 +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 ; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0 ; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll index 2e2103ad5e06d..546aa751c9c73 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -59,16 +59,14 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFHMIN-LABEL: vfmin_vv_v2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -127,16 +125,14 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFHMIN-LABEL: vfmin_vv_v4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: 
vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -197,15 +193,13 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFHMIN-LABEL: vfmin_vv_v8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 @@ -269,15 +263,13 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFHMIN-LABEL: vfmin_vv_v16f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v12, v12, v16, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 @@ -587,7 +579,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v25, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 @@ -601,29 +593,29 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB24_2 +; CHECK-NEXT: bltu a2, a3, .LBB24_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB24_2: ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: 
mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: mul a0, a0, a3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: vmv8r.v v8, v16 @@ -680,10 +672,10 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16, v0.t +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v24, v8, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll index 1362055c4dabf..196915bf141d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll @@ -24,16 +24,14 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-LABEL: vfmin_v2f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -57,16 +55,14 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) { ; ZVFHMIN-LABEL: vfmin_v4f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -90,15 +86,13 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) { ; ZVFHMIN-LABEL: vfmin_v8f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 @@ -123,15 +117,13 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) { ; ZVFHMIN-LABEL: vfmin_v16f16_vv: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v12, v12, v16, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 @@ -295,8 +287,8 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmin_v2f16_vv_nnana: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vfadd.vv v8, v8, v8 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v10, v9, v8, v0 ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 ; ZVFH-NEXT: vmerge.vvm v8, v8, v9, v0 @@ -332,8 +324,8 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmin_v2f16_vv_nnanb: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH-NEXT: vmfeq.vv v0, v8, v8 ; ZVFH-NEXT: vfadd.vv v9, v9, v9 +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 ; ZVFH-NEXT: vmerge.vvm v10, v8, v9, v0 ; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll index 3a7ded1537ef6..f192a053ac888 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll @@ -12,17 +12,17 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x 
half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x half> %r @@ -36,17 +36,17 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x half> %r @@ -60,17 +60,17 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x half> %r @@ -84,17 +84,17 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x half> %r @@ -111,15 +111,15 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half> %v, metadata !"round.dynamic", metadata 
!"fpexcept.strict") ret <32 x half> %r @@ -135,15 +135,15 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x float> %r @@ -159,15 +159,15 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x float> %r @@ -183,15 +183,15 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x float> %r @@ -207,15 +207,15 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x float> %r @@ -229,17 +229,17 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x double> %r @@ -253,17 +253,17 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI10_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI10_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x double> %r @@ -277,17 +277,17 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x double> %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index e82891f90d85e..4c0186e7d219c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -46,9 +46,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, ; CHECK-NEXT: vmadd.vx v14, a0, v12 ; CHECK-NEXT: li a0, 129 ; CHECK-NEXT: vmv.s.x v15, a0 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vcompress.vm v12, v8, v15 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t ; CHECK-NEXT: vmv1r.v v8, v12 ; CHECK-NEXT: ret @@ -1749,13 +1751,13 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v8, fa0 ; CHECK-NEXT: vfmv.v.f v9, fa4 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 ; CHECK-NEXT: vfslide1down.vf v9, v9, fa5 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 ; CHECK-NEXT: vfslide1down.vf v9, v9, fa6 ; CHECK-NEXT: vfslide1down.vf v10, v8, fa3 ; 
CHECK-NEXT: vfslide1down.vf v8, v9, fa7 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslidedown.vi v8, v10, 4, v0.t ; CHECK-NEXT: ret %v0 = insertelement <8 x float> poison, float %e0, i64 0 @@ -1800,13 +1802,13 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d ; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, mu ; CHECK-NEXT: vfmv.v.f v8, fa0 ; CHECK-NEXT: vfmv.v.f v9, fa4 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1 ; CHECK-NEXT: vfslide1down.vf v9, v9, fa5 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2 ; CHECK-NEXT: vfslide1down.vf v9, v9, fa6 ; CHECK-NEXT: vfslide1down.vf v10, v8, fa3 ; CHECK-NEXT: vfslide1down.vf v8, v9, fa7 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslidedown.vi v8, v10, 4, v0.t ; CHECK-NEXT: ret %v0 = insertelement <8 x double> poison, double %e0, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index 4b09b571b9406..1d02918ac8a9a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -38,15 +38,20 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) { define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { ; V128-LABEL: interleave_v2f64: ; V128: # %bb.0: +; V128-NEXT: csrr a0, vlenb ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; V128-NEXT: vmv1r.v v12, v9 -; V128-NEXT: vid.v v9 +; V128-NEXT: vid.v v10 +; V128-NEXT: srli a0, a0, 3 +; V128-NEXT: vsrl.vi v10, v10, 1 +; V128-NEXT: vslidedown.vx v11, v10, a0 +; V128-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; V128-NEXT: vrgatherei16.vv v13, v9, v11 +; V128-NEXT: vrgatherei16.vv v12, v9, v10 +; V128-NEXT: vrgatherei16.vv v15, v8, v11 +; V128-NEXT: vrgatherei16.vv v14, v8, v10 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vsrl.vi v14, v9, 1 -; V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; V128-NEXT: vrgatherei16.vv v10, v8, v14 -; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t -; V128-NEXT: vmv.v.v v8, v10 +; V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; V128-NEXT: vmerge.vvm v8, v14, v12, v0 ; V128-NEXT: ret ; ; RV32-V512-LABEL: interleave_v2f64: @@ -54,9 +59,9 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { ; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; RV32-V512-NEXT: vid.v v10 ; RV32-V512-NEXT: vsrl.vi v11, v10, 1 -; RV32-V512-NEXT: vmv.v.i v0, 10 ; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 +; RV32-V512-NEXT: vmv.v.i v0, 10 ; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t ; RV32-V512-NEXT: vmv.v.v v8, v10 ; RV32-V512-NEXT: ret @@ -66,8 +71,8 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { ; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV64-V512-NEXT: vid.v v10 ; RV64-V512-NEXT: vsrl.vi v11, v10, 1 -; RV64-V512-NEXT: vmv.v.i v0, 10 ; RV64-V512-NEXT: vrgather.vv v10, v8, v11 +; RV64-V512-NEXT: vmv.v.i v0, 10 ; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t ; RV64-V512-NEXT: vmv.v.v v8, v10 ; RV64-V512-NEXT: ret @@ -253,8 +258,8 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-NEXT: vzext.vf2 v8, v24 ; V128-NEXT: addi a1, a1, -1366 ; V128-NEXT: vzext.vf2 v24, v0 -; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vsll.vx v8, v8, a0 +; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; V128-NEXT: vmerge.vvm v24, v24, v8, v0 ; V128-NEXT: addi a0, sp, 16 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll index c14eae0b1de61..92374177d93e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -17,10 +17,10 @@ define void @fcmp_oeq_vv_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fcmp_oeq_vv_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsm.v v8, (a2) @@ -45,10 +45,10 @@ define void @fcmp_oeq_vv_v8f16_nonans(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fcmp_oeq_vv_v8f16_nonans: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsm.v v8, (a2) @@ -173,10 +173,10 @@ define void @fcmp_olt_vv_v16f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fcmp_olt_vv_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v16, v12 ; ZVFHMIN-NEXT: vsm.v v8, (a2) @@ -201,10 +201,10 @@ define void @fcmp_olt_vv_v16f16_nonans(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fcmp_olt_vv_v16f16_nonans: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v16, v12 ; ZVFHMIN-NEXT: vsm.v v8, (a2) @@ -345,10 +345,10 @@ define void @fcmp_ule_vv_v32f16_nonans(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: li a3, 32 ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v12, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v12, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vmfle.vv v8, v24, v16 ; ZVFHMIN-NEXT: vsm.v v8, (a2) @@ -535,11 +535,11 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fcmp_ord_vv_v4f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFH-NEXT: vle16.v v8, (a1) -; ZVFH-NEXT: vle16.v v9, (a0) -; ZVFH-NEXT: vmfeq.vv v8, v8, 
v8 +; ZVFH-NEXT: vle16.v v8, (a0) +; ZVFH-NEXT: vle16.v v9, (a1) ; ZVFH-NEXT: vmfeq.vv v9, v9, v9 -; ZVFH-NEXT: vmand.mm v0, v9, v8 +; ZVFH-NEXT: vmfeq.vv v8, v8, v8 +; ZVFH-NEXT: vmand.mm v0, v8, v9 ; ZVFH-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFH-NEXT: vmv.v.i v8, 0 ; ZVFH-NEXT: vmerge.vim v8, v8, 1, v0 @@ -555,14 +555,14 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fcmp_ord_vv_v4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v9, v10, v10 -; ZVFHMIN-NEXT: vmfeq.vv v8, v8, v8 -; ZVFHMIN-NEXT: vmand.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 +; ZVFHMIN-NEXT: vmfeq.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmand.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 ; ZVFHMIN-NEXT: vmerge.vim v8, v8, 1, v0 @@ -585,11 +585,11 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fcmp_uno_vv_v4f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH-NEXT: vle16.v v8, (a1) -; ZVFH-NEXT: vle16.v v9, (a0) -; ZVFH-NEXT: vmfne.vv v8, v8, v8 +; ZVFH-NEXT: vle16.v v8, (a0) +; ZVFH-NEXT: vle16.v v9, (a1) ; ZVFH-NEXT: vmfne.vv v9, v9, v9 -; ZVFH-NEXT: vmor.mm v0, v9, v8 +; ZVFH-NEXT: vmfne.vv v8, v8, v8 +; ZVFH-NEXT: vmor.mm v0, v8, v9 ; ZVFH-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFH-NEXT: vmv.v.i v8, 0 ; ZVFH-NEXT: vmerge.vim v8, v8, 1, v0 @@ -605,14 +605,14 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fcmp_uno_vv_v4f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfne.vv v9, v10, v10 -; ZVFHMIN-NEXT: vmfne.vv v8, v8, v8 -; ZVFHMIN-NEXT: vmor.mm v0, v8, v9 +; ZVFHMIN-NEXT: vmfne.vv v8, v10, v10 +; ZVFHMIN-NEXT: vmfne.vv v9, v9, v9 +; ZVFHMIN-NEXT: vmor.mm v0, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; ZVFHMIN-NEXT: vmv.v.i v8, 0 ; ZVFHMIN-NEXT: vmerge.vim v8, v8, 1, v0 @@ -692,12 +692,13 @@ define void @fcmp_oeq_vf_v8f16_nonans(ptr %x, half %y, ptr %z) { define void @fcmp_une_vf_v4f32(ptr %x, float %y, ptr %z) { ; CHECK-LABEL: fcmp_une_vf_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfne.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma @@ -717,12 +718,13 @@ define void @fcmp_une_vf_v4f32(ptr %x, float %y, ptr %z) { define void @fcmp_une_vf_v4f32_nonans(ptr %x, float %y, ptr %z) { ; CHECK-LABEL: fcmp_une_vf_v4f32_nonans: 
; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfne.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma @@ -742,12 +744,13 @@ define void @fcmp_une_vf_v4f32_nonans(ptr %x, float %y, ptr %z) { define void @fcmp_ogt_vf_v2f64(ptr %x, double %y, ptr %z) { ; CHECK-LABEL: fcmp_ogt_vf_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmfgt.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -767,12 +770,13 @@ define void @fcmp_ogt_vf_v2f64(ptr %x, double %y, ptr %z) { define void @fcmp_ogt_vf_v2f64_nonans(ptr %x, double %y, ptr %z) { ; CHECK-LABEL: fcmp_ogt_vf_v2f64_nonans: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmfgt.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -1333,12 +1337,13 @@ define void @fcmp_oeq_fv_v8f16_nonans(ptr %x, half %y, ptr %z) { define void @fcmp_une_fv_v4f32(ptr %x, float %y, ptr %z) { ; CHECK-LABEL: fcmp_une_fv_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfne.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma @@ -1358,12 +1363,13 @@ define void @fcmp_une_fv_v4f32(ptr %x, float %y, ptr %z) { define void @fcmp_une_fv_v4f32_nonans(ptr %x, float %y, ptr %z) { ; CHECK-LABEL: fcmp_une_fv_v4f32_nonans: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vmfne.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma @@ -1383,12 +1389,13 @@ define void @fcmp_une_fv_v4f32_nonans(ptr %x, float %y, ptr 
%z) { define void @fcmp_ogt_fv_v2f64(ptr %x, double %y, ptr %z) { ; CHECK-LABEL: fcmp_ogt_fv_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmflt.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma @@ -1408,12 +1415,13 @@ define void @fcmp_ogt_fv_v2f64(ptr %x, double %y, ptr %z) { define void @fcmp_ogt_fv_v2f64_nonans(ptr %x, double %y, ptr %z) { ; CHECK-LABEL: fcmp_ogt_fv_v2f64_nonans: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vmflt.vf v0, v8, fa0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index 41d8abb9b73eb..8e288fec53778 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -41,10 +41,10 @@ define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) { define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) { ; CHECK-LABEL: shuffle_fv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 9 +; CHECK-NEXT: lui a0, %hi(.LCPI3_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; CHECK-NEXT: ret @@ -55,10 +55,10 @@ define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) { define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) { ; CHECK-LABEL: shuffle_vf_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: lui a0, %hi(.LCPI4_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 ; CHECK-NEXT: ret @@ -105,11 +105,12 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI7_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle16.v v14, (a0) -; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vrgather.vi v12, v10, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -120,14 +121,16 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) define <4 x double> 
@vrgather_shuffle_xv_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_xv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI8_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: lui a0, %hi(.LCPI8_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI8_0)(a0) ; CHECK-NEXT: vrsub.vi v12, v10, 4 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa5 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -138,16 +141,16 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v10, 9 +; CHECK-NEXT: lui a0, %hi(.LCPI9_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vcompress.vm v12, v8, v10 +; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 3 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> @@ -331,8 +334,8 @@ define <4 x bfloat> @vrgather_shuffle_vv_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y ; CHECK-NEXT: addi a0, a0, %lo(.LCPI25_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v11, (a0) -; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -375,8 +378,8 @@ define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) { ; CHECK-NEXT: addi a0, a0, %lo(.LCPI28_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v11, (a0) -; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -399,18 +402,18 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) { define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) { ; CHECK-LABEL: shuffle_disjoint_lanes: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI30_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI30_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) ; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v18, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16 ; CHECK-NEXT: ret %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> ret <16 x float> %out @@ -437,12 +440,12 @@ define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> 
%v, <16 x ; CHECK-NEXT: lui a0, %hi(.LCPI32_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0) ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: vrgather.vi v16, v8, 7 +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 15 ; CHECK-NEXT: addi a0, a0, 240 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vi v16, v8, 7 -; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t +; CHECK-NEXT: vrgatherei16.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> @@ -452,14 +455,14 @@ define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) { ; CHECK-LABEL: shuffle_disjoint_lanes_one_splat: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: lui a0, %hi(.LCPI33_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: lui a0, 15 ; CHECK-NEXT: addi a0, a0, 240 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll index 58b0a17cdccd6..fed76227a2b69 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll @@ -53,8 +53,8 @@ define void @gather_const_v2f64(ptr %x) { define void @gather_const_v64f16(ptr %x) { ; CHECK-LABEL: gather_const_v64f16: ; CHECK: # %bb.0: -; CHECK-NEXT: flh fa5, 94(a0) ; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: flh fa5, 94(a0) ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse16.v v8, (a0) @@ -70,8 +70,8 @@ define void @gather_const_v64f16(ptr %x) { define void @gather_const_v32f32(ptr %x) { ; CHECK-LABEL: gather_const_v32f32: ; CHECK: # %bb.0: -; CHECK-NEXT: flw fa5, 68(a0) ; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: flw fa5, 68(a0) ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vfmv.v.f v8, fa5 ; CHECK-NEXT: vse32.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 585a331e55094..86c727199bbae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -9,10 +9,10 @@ define void @fadd_v8bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fadd_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -30,10 +30,10 @@ define void @fadd_v6bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fadd_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -60,10 +60,10 @@ define void @fadd_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fadd_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -90,10 +90,10 @@ define void @fadd_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fadd_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -143,10 +143,10 @@ define void @fsub_v8bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fsub_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -164,10 +164,10 @@ define void @fsub_v6bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fsub_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsub.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -194,10 +194,10 @@ define void @fsub_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fsub_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -224,10 +224,10 @@ define void @fsub_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fsub_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -277,10 +277,10 @@ define void @fmul_v8bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fmul_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -298,10 +298,10 @@ define void @fmul_v6bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fmul_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -328,10 +328,10 @@ define void @fmul_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fmul_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -358,10 +358,10 @@ define void @fmul_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fmul_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -411,10 +411,10 @@ define void @fdiv_v8bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fdiv_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -432,10 +432,10 @@ define void @fdiv_v6bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fdiv_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, 
zero, e32, m2, ta, ma ; CHECK-NEXT: vfdiv.vv v8, v12, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -462,10 +462,10 @@ define void @fdiv_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fdiv_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -492,10 +492,10 @@ define void @fdiv_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fdiv_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v8, v12, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -757,13 +757,13 @@ define void @copysign_v8bf16(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x bfloat>, ptr %x @@ -777,13 +777,13 @@ define void @copysign_v6bf16(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x bfloat>, ptr %x @@ -806,13 +806,13 @@ define void @copysign_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: copysign_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: addi a1, a1, -1 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1 -; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x @@ -835,13 +835,13 @@ define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: copysign_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: lui a1, 8 -; 
ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: addi a1, a1, -1 ; ZVFHMIN-NEXT: vand.vx v9, v9, a1 -; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: addi a1, a1, -1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x @@ -1023,14 +1023,14 @@ define void @copysign_neg_v8bf16(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: addi a2, a1, -1 -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a2 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vxor.vx v9, v9, a1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x bfloat>, ptr %x @@ -1045,14 +1045,14 @@ define void @copysign_neg_v6bf16(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: addi a2, a1, -1 -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a2 -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vxor.vx v9, v9, a1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x bfloat>, ptr %x @@ -1076,14 +1076,14 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: copysign_neg_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: addi a2, a1, -1 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a2 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a2 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x @@ -1107,14 +1107,14 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: copysign_neg_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: addi a2, a1, -1 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vand.vx v9, v9, a2 -; ZVFHMIN-NEXT: vand.vx v8, v8, a1 -; ZVFHMIN-NEXT: vor.vv v8, v9, v8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vand.vx v8, v8, a2 +; ZVFHMIN-NEXT: vand.vx v9, v9, a1 +; ZVFHMIN-NEXT: vor.vv v8, v8, v9 ; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x @@ -1211,10 +1211,10 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_trunc_v4f16_v4f32: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFH-NEXT: vle32.v v8, (a1) -; ZVFH-NEXT: vle16.v v9, (a0) -; ZVFH-NEXT: vfncvt.f.f.w v10, v8 -; ZVFH-NEXT: vfsgnjn.vv v8, v9, 
v10 +; ZVFH-NEXT: vle16.v v8, (a0) +; ZVFH-NEXT: vle32.v v9, (a1) +; ZVFH-NEXT: vfncvt.f.f.w v10, v9 +; ZVFH-NEXT: vfsgnjn.vv v8, v8, v10 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1245,10 +1245,10 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 3, e16, mf2, ta, ma -; ZVFH-NEXT: vle32.v v8, (a1) -; ZVFH-NEXT: vle16.v v9, (a0) -; ZVFH-NEXT: vfncvt.f.f.w v10, v8 -; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10 +; ZVFH-NEXT: vle16.v v8, (a0) +; ZVFH-NEXT: vle32.v v9, (a1) +; ZVFH-NEXT: vfncvt.f.f.w v10, v9 +; ZVFH-NEXT: vfsgnjn.vv v8, v8, v10 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1279,11 +1279,11 @@ define void @copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_ext_v2f64_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfsgnjn.vv v8, v9, v10 +; CHECK-NEXT: vfsgnjn.vv v8, v8, v10 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x @@ -1417,17 +1417,17 @@ define void @fma_v8bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a2) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vfmadd.vv v14, v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x bfloat>, ptr %x %b = load <8 x bfloat>, ptr %y @@ -1441,17 +1441,17 @@ define void @fma_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a2) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vfmadd.vv v14, v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v14 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x bfloat>, ptr %x %b = load <6 x bfloat>, ptr %y @@ -1475,17 +1475,17 @@ define void @fma_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fma_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a2) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; 
ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v10, (a2) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vse16.v v10, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -1509,17 +1509,17 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fma_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a2) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v10, (a2) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 +; ZVFHMIN-NEXT: vfmadd.vv v14, v10, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vse16.v v10, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v14 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y @@ -1569,19 +1569,19 @@ define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmsub_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a2) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vle16.v v10, (a2) ; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vxor.vx v10, v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v12, v14 +; CHECK-NEXT: vfmadd.vv v10, v12, v14 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x bfloat>, ptr %x %b = load <8 x bfloat>, ptr %y @@ -1596,19 +1596,19 @@ define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmsub_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a2) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vle16.v v10, (a2) ; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vxor.vx v10, v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v12, v14 +; CHECK-NEXT: vfmadd.vv v10, 
v12, v14 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x bfloat>, ptr %x %b = load <6 x bfloat>, ptr %y @@ -1633,19 +1633,19 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fmsub_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a2) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v10, (a2) ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 +; ZVFHMIN-NEXT: vfmadd.vv v10, v12, v14 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vse16.v v10, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -1670,19 +1670,19 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fmsub_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a2) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vle16.v v10, (a2) ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 +; ZVFHMIN-NEXT: vxor.vx v10, v10, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 +; ZVFHMIN-NEXT: vfmadd.vv v10, v12, v14 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 -; ZVFHMIN-NEXT: vse16.v v10, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <6 x half>, ptr %x %b = load <6 x half>, ptr %y @@ -1736,10 +1736,10 @@ define void @fadd_v16bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fadd_v16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfadd.vv v8, v16, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1766,10 +1766,10 @@ define void @fadd_v16f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fadd_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfadd.vv v8, v16, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1819,10 +1819,10 @@ define void @fsub_v16bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fsub_v16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfsub.vv v8, v16, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1849,10 +1849,10 @@ define void @fsub_v16f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fsub_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfsub.vv v8, v16, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1902,10 +1902,10 @@ define void @fmul_v16bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fmul_v16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfmul.vv v8, v16, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1932,10 +1932,10 @@ define void @fmul_v16f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fmul_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v16, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1985,10 +1985,10 @@ define void @fdiv_v16bf16(ptr %x, ptr %y) { ; CHECK-LABEL: fdiv_v16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfdiv.vv v8, v16, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -2015,10 +2015,10 @@ define void @fdiv_v16f16(ptr %x, ptr %y) { ; ZVFHMIN-LABEL: fdiv_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v8, v16, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -2134,17 +2134,17 @@ define void @fma_v16bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a2) -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v12, (a1) -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vle16.v v12, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v20, v16 +; CHECK-NEXT: vfmadd.vv v20, v12, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 -; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v20 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x bfloat>, ptr %x %b = load <16 x bfloat>, ptr %y @@ -2168,17 +2168,17 @@ define void @fma_v16f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fma_v16f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a2) -; ZVFHMIN-NEXT: vle16.v v10, (a0) -; ZVFHMIN-NEXT: vle16.v v12, (a1) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v10, (a1) +; ZVFHMIN-NEXT: vle16.v v12, (a2) +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v20, v16 +; ZVFHMIN-NEXT: vfmadd.vv v20, v12, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v8 -; ZVFHMIN-NEXT: vse16.v v12, (a0) +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v20 +; ZVFHMIN-NEXT: vse16.v v8, (a0) ; ZVFHMIN-NEXT: ret %a = load <16 x half>, ptr %x %b = load <16 x half>, ptr %y @@ -3347,13 +3347,13 @@ define void @fdiv_fv_v2f64(ptr %x, double %y) { define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK-LABEL: fma_vf_v8bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v14, v12 @@ -3373,13 +3373,13 @@ define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { define void @fma_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK-LABEL: fma_vf_v6bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v 
v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v14, v12 @@ -3408,13 +3408,13 @@ define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fma_vf_v8f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 @@ -3443,13 +3443,13 @@ define void @fma_vf_v6f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fma_vf_v6f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 @@ -3505,13 +3505,13 @@ define void @fma_vf_v2f64(ptr %x, ptr %y, double %z) { define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK-LABEL: fma_fv_v8bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v14, v12 @@ -3531,13 +3531,13 @@ define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) { define void @fma_fv_v6bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK-LABEL: fma_fv_v6bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v14, v12 @@ -3566,13 +3566,13 @@ define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fma_fv_v8f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; 
ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 @@ -3601,13 +3601,13 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fma_fv_v6f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) -; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v10, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) +; ZVFHMIN-NEXT: vmv.v.x v10, a2 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v14, v12 @@ -3665,13 +3665,13 @@ define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vxor.vx v9, v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v12, v14 @@ -3694,13 +3694,13 @@ define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: vmv.v.x v10, a2 -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 +; CHECK-NEXT: vxor.vx v9, v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmadd.vv v8, v12, v14 @@ -3732,13 +3732,13 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vmv.v.x v10, a2 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 @@ -3770,13 +3770,13 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vmv.v.x 
v10, a2 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 +; ZVFHMIN-NEXT: vxor.vx v9, v9, a1 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmadd.vv v8, v12, v14 @@ -4057,11 +4057,11 @@ define void @ceil_v8bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4084,12 +4084,12 @@ define void @ceil_v6bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a1 @@ -4113,9 +4113,9 @@ define void @ceil_v8f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI177_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI177_0)(a1) +; ZVFH-NEXT: fsrmi a1, 3 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: fsrmi a1, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4130,11 +4130,11 @@ define void @ceil_v8f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 3 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a1, 3 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a1 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4157,10 +4157,10 @@ define void @ceil_v6f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI178_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI178_0)(a1) +; ZVFH-NEXT: fsrmi a1, 3 ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: fsrmi a1, 3 ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 @@ -4176,12 +4176,12 @@ define void @ceil_v6f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 3 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a1, 3 ; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a1 @@ -4205,9 +4205,9 @@ define void @ceil_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4228,9 +4228,9 @@ define void 
@ceil_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: lui a1, %hi(.LCPI180_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI180_0)(a1) +; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4251,11 +4251,11 @@ define void @floor_v8bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4278,12 +4278,12 @@ define void @floor_v6bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a1 @@ -4307,9 +4307,9 @@ define void @floor_v8f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI183_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI183_0)(a1) +; ZVFH-NEXT: fsrmi a1, 2 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: fsrmi a1, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4324,11 +4324,11 @@ define void @floor_v8f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 2 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a1, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a1 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4351,10 +4351,10 @@ define void @floor_v6f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI184_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI184_0)(a1) +; ZVFH-NEXT: fsrmi a1, 2 ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: fsrmi a1, 2 ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 @@ -4370,12 +4370,12 @@ define void @floor_v6f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 2 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a1, 2 ; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a1 @@ -4399,9 +4399,9 @@ define void @floor_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4422,9 +4422,9 @@ define void 
@floor_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: lui a1, %hi(.LCPI186_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI186_0)(a1) +; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 2 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4445,11 +4445,11 @@ define void @round_v8bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4472,12 +4472,12 @@ define void @round_v6bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a1 @@ -4501,9 +4501,9 @@ define void @round_v8f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI189_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI189_0)(a1) +; ZVFH-NEXT: fsrmi a1, 4 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: fsrmi a1, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4518,11 +4518,11 @@ define void @round_v8f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a1 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -4545,10 +4545,10 @@ define void @round_v6f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI190_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI190_0)(a1) +; ZVFH-NEXT: fsrmi a1, 4 ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: fsrmi a1, 4 ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a1 @@ -4564,12 +4564,12 @@ define void @round_v6f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a1 @@ -4593,9 +4593,9 @@ define void @round_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4616,9 +4616,9 @@ define void 
@round_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: lui a1, %hi(.LCPI192_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI192_0)(a1) +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -4746,11 +4746,11 @@ define void @nearbyint_v8bf16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: fsflags a1 @@ -4773,9 +4773,9 @@ define void @nearbyint_v8f16(ptr %x) { ; ZVFH-NEXT: vle16.v v8, (a0) ; ZVFH-NEXT: lui a1, %hi(.LCPI198_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI198_0)(a1) +; ZVFH-NEXT: frflags a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 -; ZVFH-NEXT: frflags a1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t ; ZVFH-NEXT: fsflags a1 @@ -4790,11 +4790,11 @@ define void @nearbyint_v8f16(ptr %x) { ; ZVFHMIN-NEXT: vle16.v v8, (a0) ; ZVFHMIN-NEXT: lui a1, 307200 ; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: frflags a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: fsflags a1 @@ -4817,9 +4817,9 @@ define void @nearbyint_v4f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a1, 307200 ; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a1 @@ -4840,9 +4840,9 @@ define void @nearbyint_v2f64(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: lui a1, %hi(.LCPI200_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI200_0)(a1) +; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a1 @@ -4860,11 +4860,11 @@ define void @fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmuladd_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -4889,11 +4889,11 @@ define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmuladd_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -4928,11 +4928,11 @@ define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fmuladd_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: vle16.v v10, (a2) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v14, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -4967,11 +4967,11 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fmuladd_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: vle16.v v10, (a2) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v14, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -5032,11 +5032,11 @@ define void @fmsub_fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmsub_fmuladd_v8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -5062,11 +5062,11 @@ define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmsub_fmuladd_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a1) -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) ; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfmul.vv v8, v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -5102,11 +5102,11 @@ define void @fmsub_fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fmsub_fmuladd_v8f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: vle16.v v10, (a2) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v14, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -5142,11 +5142,11 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFHMIN-LABEL: fmsub_fmuladd_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli 
zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v8, (a1) -; ZVFHMIN-NEXT: vle16.v v9, (a0) +; ZVFHMIN-NEXT: vle16.v v8, (a0) +; ZVFHMIN-NEXT: vle16.v v9, (a1) ; ZVFHMIN-NEXT: vle16.v v10, (a2) -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfmul.vv v8, v14, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll index a1466d46f1ba7..5106ec1189327 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll @@ -96,9 +96,9 @@ declare <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float>, <32 x i1>, i32) define <32 x double> @vfpext_v32f32_v32f64(<32 x float> %a, <32 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vfpext_v32f32_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB7_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll index c6b8b602718b7..c18d8639dc91c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpowi.ll @@ -743,8 +743,8 @@ define <16 x float> @powi_v16f32(<16 x float> %x, i32 %y) nounwind { ; RV64-NEXT: addi a1, sp, 64 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vse32.v v8, (a1) -; RV64-NEXT: flw fa0, 124(sp) ; RV64-NEXT: sext.w s2, a0 +; RV64-NEXT: flw fa0, 124(sp) ; RV64-NEXT: mv a0, s2 ; RV64-NEXT: call __powisf2 ; RV64-NEXT: fsw fa0, 188(sp) @@ -1188,8 +1188,8 @@ define <8 x double> @powi_v8f64(<8 x double> %x, i32 %y) nounwind { ; RV64-NEXT: addi a1, sp, 64 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vse64.v v8, (a1) -; RV64-NEXT: fld fa0, 120(sp) ; RV64-NEXT: sext.w s2, a0 +; RV64-NEXT: fld fa0, 120(sp) ; RV64-NEXT: mv a0, s2 ; RV64-NEXT: call __powidf2 ; RV64-NEXT: fsd fa0, 184(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll index f6c992280c6e3..e4609f1e9313d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll @@ -394,9 +394,9 @@ declare <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double>, <32 x i1>, i32) define <32 x i64> @vfptosi_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_v32i64_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll index af225f4d95aa2..846675cf5a9b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll @@ -394,9 +394,9 @@ declare <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double>, <32 x i1>, i32) define <32 x i64> @vfptoui_v32i64_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_v32i64_v32f64: ; CHECK: # %bb.0: -; 
CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll index 582706e4dfa18..ae53abc3f8c9a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll @@ -99,8 +99,8 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v12, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB7_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll index be32c033fe373..751a6e45c0c3c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll @@ -12,11 +12,11 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -35,11 +35,11 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -58,11 +58,11 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -81,11 +81,11 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -104,11 +104,11 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, 
mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -130,9 +130,9 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -153,9 +153,9 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -176,9 +176,9 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -199,9 +199,9 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -222,9 +222,9 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -245,9 +245,9 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -266,11 +266,11 @@ define <1 x double> @round_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -289,11 +289,11 @@ define <2 x double> @round_v2f64(<2 x double> %x) strictfp { ; 
CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -312,11 +312,11 @@ define <4 x double> @round_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -335,11 +335,11 @@ define <8 x double> @round_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll index 774ce5c7859c9..2bf3e9596597d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll @@ -13,12 +13,12 @@ define <1 x half> @round_v1f16(<1 x half> %x) { ; ZVFH-LABEL: round_v1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,11 +31,11 @@ define <1 x half> @round_v1f16(<1 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -52,12 +52,12 @@ declare <1 x half> @llvm.round.v1f16(<1 x half>) define <2 x half> @round_v2f16(<2 x half> %x) { ; ZVFH-LABEL: round_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -70,11 +70,11 @@ define <2 x 
half> @round_v2f16(<2 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -91,12 +91,12 @@ declare <2 x half> @llvm.round.v2f16(<2 x half>) define <4 x half> @round_v4f16(<4 x half> %x) { ; ZVFH-LABEL: round_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -109,11 +109,11 @@ define <4 x half> @round_v4f16(<4 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -130,12 +130,12 @@ declare <4 x half> @llvm.round.v4f16(<4 x half>) define <8 x half> @round_v8f16(<8 x half> %x) { ; ZVFH-LABEL: round_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -148,11 +148,11 @@ define <8 x half> @round_v8f16(<8 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -169,12 +169,12 @@ declare <8 x half> @llvm.round.v8f16(<8 x half>) define <16 x half> @round_v16f16(<16 x half> %x) { ; ZVFH-LABEL: round_v16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -187,11 +187,11 @@ define <16 x half> @round_v16f16(<16 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: 
fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -208,15 +208,15 @@ declare <16 x half> @llvm.round.v16f16(<16 x half>) define <32 x half> @round_v32f16(<32 x half> %x) { ; ZVFH-LABEL: round_v32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: li a0, 32 +; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: fsrmi a1, 4 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t -; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t @@ -226,15 +226,15 @@ define <32 x half> @round_v32f16(<32 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: li a0, 32 ; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 4 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: fsrm a1 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t @@ -253,8 +253,8 @@ define <1 x float> @round_v1f32(<1 x float> %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -273,8 +273,8 @@ define <2 x float> @round_v2f32(<2 x float> %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -293,8 +293,8 @@ define <4 x float> @round_v4f32(<4 x float> %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -313,8 +313,8 @@ define <8 x float> @round_v8f32(<8 x float> %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -333,8 +333,8 @@ define <16 x float> @round_v16f32(<16 x float> %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, 
v12, v0.t @@ -349,12 +349,12 @@ declare <16 x float> @llvm.round.v16f32(<16 x float>) define <1 x double> @round_v1f64(<1 x double> %x) { ; CHECK-LABEL: round_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI11_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -369,12 +369,12 @@ declare <1 x double> @llvm.round.v1f64(<1 x double>) define <2 x double> @round_v2f64(<2 x double> %x) { ; CHECK-LABEL: round_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI12_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -389,12 +389,12 @@ declare <2 x double> @llvm.round.v2f64(<2 x double>) define <4 x double> @round_v4f64(<4 x double> %x) { ; CHECK-LABEL: round_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI13_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -409,12 +409,12 @@ declare <4 x double> @llvm.round.v4f64(<4 x double>) define <8 x double> @round_v8f64(<8 x double> %x) { ; CHECK-LABEL: round_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI14_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll index 5c0279e133dfa..c61e707bd89f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll @@ -12,11 +12,11 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -35,11 +35,11 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; 
CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -58,11 +58,11 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -81,11 +81,11 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -104,11 +104,11 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -130,9 +130,9 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -153,9 +153,9 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -176,9 +176,9 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -199,9 +199,9 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -222,9 +222,9 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -245,9 +245,9 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -266,11 +266,11 @@ define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -289,11 +289,11 @@ define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -312,11 +312,11 @@ define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -335,11 +335,11 @@ define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp { ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll index 0b6baad127643..697fc657af5d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll @@ -13,12 
+13,12 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) { ; ZVFH-LABEL: roundeven_v1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -31,11 +31,11 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -52,12 +52,12 @@ declare <1 x half> @llvm.roundeven.v1f16(<1 x half>) define <2 x half> @roundeven_v2f16(<2 x half> %x) { ; ZVFH-LABEL: roundeven_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -70,11 +70,11 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -91,12 +91,12 @@ declare <2 x half> @llvm.roundeven.v2f16(<2 x half>) define <4 x half> @roundeven_v4f16(<4 x half> %x) { ; ZVFH-LABEL: roundeven_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -109,11 +109,11 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -130,12 +130,12 @@ declare <4 x half> @llvm.roundeven.v4f16(<4 x half>) define <8 x half> @roundeven_v8f16(<8 x half> %x) { ; ZVFH-LABEL: roundeven_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, 
%hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -148,11 +148,11 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -169,12 +169,12 @@ declare <8 x half> @llvm.roundeven.v8f16(<8 x half>) define <16 x half> @roundeven_v16f16(<16 x half> %x) { ; ZVFH-LABEL: roundeven_v16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -187,11 +187,11 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) { ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -208,15 +208,15 @@ declare <16 x half> @llvm.roundeven.v16f16(<16 x half>) define <32 x half> @roundeven_v32f16(<32 x half> %x) { ; ZVFH-LABEL: roundeven_v32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: li a0, 32 +; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; ZVFH-NEXT: fsrmi a1, 0 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t -; ZVFH-NEXT: fsrm a0 +; ZVFH-NEXT: fsrm a1 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t @@ -226,15 +226,15 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: li a0, 32 ; ZVFHMIN-NEXT: lui a1, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a1 +; ZVFHMIN-NEXT: fsrmi a1, 0 ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: fmv.w.x fa5, a1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: fsrm a0 +; ZVFHMIN-NEXT: fsrm a1 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t @@ 
-253,8 +253,8 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -273,8 +273,8 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -293,8 +293,8 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -313,8 +313,8 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -333,8 +333,8 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -349,12 +349,12 @@ declare <16 x float> @llvm.roundeven.v16f32(<16 x float>) define <1 x double> @roundeven_v1f64(<1 x double> %x) { ; CHECK-LABEL: roundeven_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI11_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -369,12 +369,12 @@ declare <1 x double> @llvm.roundeven.v1f64(<1 x double>) define <2 x double> @roundeven_v2f64(<2 x double> %x) { ; CHECK-LABEL: roundeven_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI12_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -389,12 +389,12 @@ declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) define <4 x double> @roundeven_v4f64(<4 x double> %x) { ; CHECK-LABEL: roundeven_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI13_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; 
CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -409,12 +409,12 @@ declare <4 x double> @llvm.roundeven.v4f64(<4 x double>) define <8 x double> @roundeven_v8f64(<8 x double> %x) { ; CHECK-LABEL: roundeven_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI14_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index 62e7e3b109902..82d740d3113eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -285,14 +285,14 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v8i32_v2i32_2: ; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) ; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vle32.v v10, (a1) +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) ; VLA-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; VLA-NEXT: vslideup.vi v8, v10, 2 +; VLA-NEXT: vslideup.vi v10, v8, 2 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: vse32.v v10, (a0) ; VLA-NEXT: ret ; ; VLS-LABEL: insert_v8i32_v2i32_2: @@ -314,13 +314,12 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v8i32_v2i32_6: ; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) ; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vle32.v v10, (a1) +; VLA-NEXT: vle32.v v8, (a1) ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vslideup.vi v8, v10, 6 -; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vslideup.vi v10, v8, 6 +; VLA-NEXT: vse32.v v10, (a0) ; VLA-NEXT: ret ; ; VLS-LABEL: insert_v8i32_v2i32_6: @@ -830,13 +829,13 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV32VLS-NEXT: vl1re64.v v8, (a0) ; RV32VLS-NEXT: addi a0, sp, 128 ; RV32VLS-NEXT: vs1r.v v8, (a0) -; RV32VLS-NEXT: addi a0, sp, 192 -; RV32VLS-NEXT: vl8re64.v v8, (a0) ; RV32VLS-NEXT: addi a0, sp, 64 +; RV32VLS-NEXT: vl8re64.v v8, (a0) +; RV32VLS-NEXT: addi a0, sp, 192 ; RV32VLS-NEXT: vl8re64.v v16, (a0) ; RV32VLS-NEXT: addi a0, a1, 128 -; RV32VLS-NEXT: vs8r.v v8, (a0) -; RV32VLS-NEXT: vs8r.v v16, (a1) +; RV32VLS-NEXT: vs8r.v v16, (a0) +; RV32VLS-NEXT: vs8r.v v8, (a1) ; RV32VLS-NEXT: addi sp, s0, -80 ; RV32VLS-NEXT: .cfi_def_cfa sp, 80 ; RV32VLS-NEXT: lw ra, 76(sp) # 4-byte Folded Reload @@ -862,13 +861,13 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV64VLS-NEXT: vl1re64.v v8, (a0) ; RV64VLS-NEXT: addi a0, sp, 128 ; RV64VLS-NEXT: vs1r.v v8, (a0) -; RV64VLS-NEXT: addi a0, sp, 192 -; RV64VLS-NEXT: vl8re64.v v8, (a0) ; RV64VLS-NEXT: addi a0, sp, 64 +; RV64VLS-NEXT: vl8re64.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 192 ; RV64VLS-NEXT: vl8re64.v v16, (a0) ; RV64VLS-NEXT: addi a0, a1, 128 -; 
RV64VLS-NEXT: vs8r.v v8, (a0) -; RV64VLS-NEXT: vs8r.v v16, (a1) +; RV64VLS-NEXT: vs8r.v v16, (a0) +; RV64VLS-NEXT: vs8r.v v8, (a1) ; RV64VLS-NEXT: addi sp, s0, -80 ; RV64VLS-NEXT: .cfi_def_cfa sp, 80 ; RV64VLS-NEXT: ld ra, 72(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 6782b2003ba94..ae0736682c9dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -542,11 +542,11 @@ define void @insertelt_c6_v8i64_0_add(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v12, (a1) -; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: li a2, 6 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vmv.s.x v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v12, (a1) ; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index c628a0d620498..b8e299e67fc04 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -530,10 +530,10 @@ define void @buildvec_dominant0_v2i32(ptr %x) { ; ; RV64V-LABEL: buildvec_dominant0_v2i32: ; RV64V: # %bb.0: -; RV64V-NEXT: lui a1, %hi(.LCPI40_0) -; RV64V-NEXT: ld a1, %lo(.LCPI40_0)(a1) ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.i v8, -1 +; RV64V-NEXT: lui a1, %hi(.LCPI40_0) +; RV64V-NEXT: ld a1, %lo(.LCPI40_0)(a1) ; RV64V-NEXT: vsetvli zero, zero, e64, m1, tu, ma ; RV64V-NEXT: vmv.s.x v8, a1 ; RV64V-NEXT: vse64.v v8, (a0) @@ -698,15 +698,16 @@ define void @buildvec_seq_v9i8(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 73 ; CHECK-NEXT: vsetivli zero, 9, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vmv.v.i v8, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: li a1, 146 -; CHECK-NEXT: vmv.s.x v8, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v9, 2, v0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, 2, v0 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret store <9 x i8> , ptr %x @@ -935,11 +936,13 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16, ; RV32-NEXT: li a0, 512 ; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma ; RV32-NEXT: vid.v v8 +; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: vmv.v.i v12, 1 +; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma ; RV32-NEXT: vsrl.vi v8, v8, 3 ; RV32-NEXT: vadd.vi v0, v8, -1 ; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV32-NEXT: vmv.v.i v8, 1 -; RV32-NEXT: vmerge.vim v8, v8, 0, v0 +; RV32-NEXT: vmerge.vim v8, v12, 0, v0 ; RV32-NEXT: ret ; ; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_1: @@ -947,11 +950,13 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16, ; RV64V-NEXT: li a0, 512 ; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; RV64V-NEXT: vid.v v8 +; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64V-NEXT: vmv.v.i v12, 1 +; RV64V-NEXT: vsetivli zero, 8, 
e64, m1, ta, ma ; RV64V-NEXT: vsrl.vi v8, v8, 2 ; RV64V-NEXT: vadd.vi v0, v8, -1 ; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV64V-NEXT: vmv.v.i v8, 1 -; RV64V-NEXT: vmerge.vim v8, v8, 0, v0 +; RV64V-NEXT: vmerge.vim v8, v12, 0, v0 ; RV64V-NEXT: ret ; ; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_1: @@ -959,11 +964,13 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16, ; RV64ZVE32-NEXT: li a0, 512 ; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; RV64ZVE32-NEXT: vid.v v8 +; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64ZVE32-NEXT: vmv.v.i v12, 1 +; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; RV64ZVE32-NEXT: vsrl.vi v8, v8, 3 ; RV64ZVE32-NEXT: vadd.vi v0, v8, -1 ; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV64ZVE32-NEXT: vmv.v.i v8, 1 -; RV64ZVE32-NEXT: vmerge.vim v8, v8, 0, v0 +; RV64ZVE32-NEXT: vmerge.vim v8, v12, 0, v0 ; RV64ZVE32-NEXT: ret ret <512 x i8> } @@ -973,27 +980,27 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v0, 15 -; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: li a0, 512 ; RV32-NEXT: li a1, 240 -; RV32-NEXT: vmv.s.x v8, a1 -; RV32-NEXT: li a1, 15 -; RV32-NEXT: vmerge.vim v10, v9, -1, v0 +; RV32-NEXT: vmerge.vim v9, v8, -1, v0 ; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV32-NEXT: vmv.v.i v12, 3 -; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmerge.vim v12, v12, 0, v0 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 15 +; RV32-NEXT: slli a1, a1, 8 ; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma -; RV32-NEXT: vmerge.vim v10, v9, -1, v0 -; RV32-NEXT: vmv.s.x v8, a1 -; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vmerge.vim v9, v8, -1, v0 +; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV32-NEXT: vmerge.vim v12, v12, 1, v0 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma -; RV32-NEXT: vmerge.vim v8, v9, -1, v0 +; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV32-NEXT: vmerge.vim v8, v12, 2, v0 @@ -1003,25 +1010,23 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.i v0, 3 -; RV64V-NEXT: vmv.v.i v9, 0 +; RV64V-NEXT: vmv.v.i v8, 0 ; RV64V-NEXT: li a0, 512 -; RV64V-NEXT: vmv.v.i v8, 12 -; RV64V-NEXT: li a1, 48 -; RV64V-NEXT: vmerge.vim v10, v9, -1, v0 +; RV64V-NEXT: vmerge.vim v9, v8, -1, v0 ; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64V-NEXT: vmv.v.i v12, 3 -; RV64V-NEXT: vmv1r.v v0, v10 +; RV64V-NEXT: vmv1r.v v0, v9 ; RV64V-NEXT: vmerge.vim v12, v12, 0, v0 -; RV64V-NEXT: vmv1r.v v0, v8 ; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma -; RV64V-NEXT: vmerge.vim v10, v9, -1, v0 -; RV64V-NEXT: vmv.s.x v8, a1 -; RV64V-NEXT: vmv.v.v v0, v10 +; RV64V-NEXT: vmv.v.i v0, 12 +; RV64V-NEXT: li a1, 48 +; RV64V-NEXT: vmerge.vim v9, v8, -1, v0 +; RV64V-NEXT: vmv.v.v v0, v9 ; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64V-NEXT: vmerge.vim v12, v12, 1, v0 -; RV64V-NEXT: vmv1r.v v0, v8 +; RV64V-NEXT: vmv.s.x v0, a1 ; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma -; RV64V-NEXT: vmerge.vim v8, v9, -1, v0 +; 
RV64V-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64V-NEXT: vmv.v.v v0, v8 ; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64V-NEXT: vmerge.vim v8, v12, 2, v0 @@ -1031,27 +1036,27 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, ; RV64ZVE32: # %bb.0: ; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma ; RV64ZVE32-NEXT: vmv.v.i v0, 15 -; RV64ZVE32-NEXT: vmv.v.i v9, 0 +; RV64ZVE32-NEXT: vmv.v.i v8, 0 ; RV64ZVE32-NEXT: li a0, 512 ; RV64ZVE32-NEXT: li a1, 240 -; RV64ZVE32-NEXT: vmv.s.x v8, a1 -; RV64ZVE32-NEXT: li a1, 15 -; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0 +; RV64ZVE32-NEXT: vmerge.vim v9, v8, -1, v0 ; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64ZVE32-NEXT: vmv.v.i v12, 3 -; RV64ZVE32-NEXT: slli a1, a1, 8 -; RV64ZVE32-NEXT: vmv1r.v v0, v10 +; RV64ZVE32-NEXT: vmv1r.v v0, v9 ; RV64ZVE32-NEXT: vmerge.vim v12, v12, 0, v0 -; RV64ZVE32-NEXT: vmv1r.v v0, v8 +; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a1 +; RV64ZVE32-NEXT: li a1, 15 +; RV64ZVE32-NEXT: slli a1, a1, 8 ; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma -; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0 -; RV64ZVE32-NEXT: vmv.s.x v8, a1 -; RV64ZVE32-NEXT: vmv.v.v v0, v10 +; RV64ZVE32-NEXT: vmerge.vim v9, v8, -1, v0 +; RV64ZVE32-NEXT: vmv.v.v v0, v9 ; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64ZVE32-NEXT: vmerge.vim v12, v12, 1, v0 -; RV64ZVE32-NEXT: vmv1r.v v0, v8 +; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a1 ; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma -; RV64ZVE32-NEXT: vmerge.vim v8, v9, -1, v0 +; RV64ZVE32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64ZVE32-NEXT: vmv.v.v v0, v8 ; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; RV64ZVE32-NEXT: vmerge.vim v8, v12, 2, v0 @@ -1358,15 +1363,13 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32-ONLY-NEXT: lbu t2, 9(a0) ; RV32-ONLY-NEXT: lbu t3, 10(a0) ; RV32-ONLY-NEXT: lbu t4, 11(a0) -; RV32-ONLY-NEXT: li t5, 255 -; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, t5 ; RV32-ONLY-NEXT: lbu t5, 12(a0) ; RV32-ONLY-NEXT: lbu t6, 13(a0) ; RV32-ONLY-NEXT: lbu s0, 14(a0) ; RV32-ONLY-NEXT: lbu a0, 15(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: li a1, 255 ; RV32-ONLY-NEXT: vmv.v.x v9, t1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t2 @@ -1382,6 +1385,9 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, s0 ; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, a1 +; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-ONLY-NEXT: .cfi_restore s0 @@ -1417,24 +1423,24 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32VB-NEXT: slli t1, t1, 24 ; RV32VB-NEXT: or a7, t0, a7 ; RV32VB-NEXT: or a4, a4, a5 -; RV32VB-NEXT: lbu a5, 12(a0) +; RV32VB-NEXT: or a5, t1, a6 +; RV32VB-NEXT: lbu a6, 12(a0) ; RV32VB-NEXT: lbu t0, 13(a0) -; RV32VB-NEXT: or a6, t1, a6 ; RV32VB-NEXT: lbu t1, 14(a0) ; RV32VB-NEXT: lbu a0, 15(a0) ; RV32VB-NEXT: slli t0, t0, 8 -; RV32VB-NEXT: or a5, a5, t0 +; RV32VB-NEXT: or a6, a6, t0 ; RV32VB-NEXT: slli t1, t1, 16 ; RV32VB-NEXT: slli a0, a0, 24 ; 
RV32VB-NEXT: or a0, a0, t1 ; RV32VB-NEXT: or a1, a1, a3 ; RV32VB-NEXT: or a2, a2, a7 -; RV32VB-NEXT: or a3, a4, a6 -; RV32VB-NEXT: or a0, a5, a0 +; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: or a0, a6, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a2 -; RV32VB-NEXT: vslide1down.vx v8, v8, a3 +; RV32VB-NEXT: vslide1down.vx v8, v8, a4 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: ret ; @@ -1449,29 +1455,29 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV32VB-PACK-NEXT: lbu a7, 6(a0) ; RV32VB-PACK-NEXT: lbu t0, 7(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: lbu a2, 8(a0) -; RV32VB-PACK-NEXT: lbu t1, 9(a0) -; RV32VB-PACK-NEXT: lbu t2, 10(a0) -; RV32VB-PACK-NEXT: lbu t3, 11(a0) -; RV32VB-PACK-NEXT: packh a3, a3, a4 -; RV32VB-PACK-NEXT: packh a4, a5, a6 -; RV32VB-PACK-NEXT: packh a5, a7, t0 +; RV32VB-PACK-NEXT: packh a2, a3, a4 +; RV32VB-PACK-NEXT: packh a3, a5, a6 +; RV32VB-PACK-NEXT: lbu a4, 8(a0) +; RV32VB-PACK-NEXT: lbu a5, 9(a0) +; RV32VB-PACK-NEXT: lbu a6, 10(a0) +; RV32VB-PACK-NEXT: lbu t1, 11(a0) +; RV32VB-PACK-NEXT: packh a7, a7, t0 +; RV32VB-PACK-NEXT: packh a4, a4, a5 +; RV32VB-PACK-NEXT: packh a5, a6, t1 ; RV32VB-PACK-NEXT: lbu a6, 12(a0) -; RV32VB-PACK-NEXT: lbu a7, 13(a0) -; RV32VB-PACK-NEXT: lbu t0, 14(a0) +; RV32VB-PACK-NEXT: lbu t0, 13(a0) +; RV32VB-PACK-NEXT: lbu t1, 14(a0) ; RV32VB-PACK-NEXT: lbu a0, 15(a0) -; RV32VB-PACK-NEXT: packh a2, a2, t1 -; RV32VB-PACK-NEXT: packh t1, t2, t3 -; RV32VB-PACK-NEXT: packh a6, a6, a7 -; RV32VB-PACK-NEXT: packh a0, t0, a0 -; RV32VB-PACK-NEXT: pack a1, a1, a3 +; RV32VB-PACK-NEXT: packh a6, a6, t0 +; RV32VB-PACK-NEXT: packh a0, t1, a0 +; RV32VB-PACK-NEXT: pack a1, a1, a2 +; RV32VB-PACK-NEXT: pack a2, a3, a7 ; RV32VB-PACK-NEXT: pack a3, a4, a5 -; RV32VB-PACK-NEXT: pack a2, a2, t1 ; RV32VB-PACK-NEXT: pack a0, a6, a0 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret ; @@ -1493,15 +1499,13 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64V-ONLY-NEXT: lbu t2, 9(a0) ; RV64V-ONLY-NEXT: lbu t3, 10(a0) ; RV64V-ONLY-NEXT: lbu t4, 11(a0) -; RV64V-ONLY-NEXT: li t5, 255 -; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, t5 ; RV64V-ONLY-NEXT: lbu t5, 12(a0) ; RV64V-ONLY-NEXT: lbu t6, 13(a0) ; RV64V-ONLY-NEXT: lbu s0, 14(a0) ; RV64V-ONLY-NEXT: lbu a0, 15(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: li a1, 255 ; RV64V-ONLY-NEXT: vmv.v.x v9, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t2 @@ -1517,6 +1521,9 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, s0 ; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, a1 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64V-ONLY-NEXT: .cfi_restore s0 @@ -1577,35 +1584,35 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; ; 
RVA22U64-PACK-LABEL: buildvec_v16i8_loads_contigous: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a1, 0(a0) +; RVA22U64-PACK-NEXT: lbu a6, 0(a0) ; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a6, 2(a0) -; RVA22U64-PACK-NEXT: lbu a7, 3(a0) -; RVA22U64-PACK-NEXT: lbu t0, 4(a0) -; RVA22U64-PACK-NEXT: lbu a3, 5(a0) -; RVA22U64-PACK-NEXT: lbu a4, 6(a0) -; RVA22U64-PACK-NEXT: lbu a5, 7(a0) -; RVA22U64-PACK-NEXT: packh t1, a1, a2 -; RVA22U64-PACK-NEXT: lbu t2, 8(a0) -; RVA22U64-PACK-NEXT: lbu t3, 9(a0) -; RVA22U64-PACK-NEXT: lbu t4, 10(a0) +; RVA22U64-PACK-NEXT: lbu a3, 2(a0) +; RVA22U64-PACK-NEXT: lbu a4, 3(a0) +; RVA22U64-PACK-NEXT: lbu a5, 4(a0) +; RVA22U64-PACK-NEXT: lbu a1, 5(a0) +; RVA22U64-PACK-NEXT: lbu a7, 6(a0) +; RVA22U64-PACK-NEXT: lbu t0, 7(a0) +; RVA22U64-PACK-NEXT: packh a6, a6, a2 +; RVA22U64-PACK-NEXT: packh t2, a3, a4 +; RVA22U64-PACK-NEXT: packh t1, a5, a1 +; RVA22U64-PACK-NEXT: lbu a4, 8(a0) +; RVA22U64-PACK-NEXT: lbu a5, 9(a0) +; RVA22U64-PACK-NEXT: lbu a2, 10(a0) ; RVA22U64-PACK-NEXT: lbu a1, 11(a0) -; RVA22U64-PACK-NEXT: packh a6, a6, a7 -; RVA22U64-PACK-NEXT: packh a7, t0, a3 -; RVA22U64-PACK-NEXT: packh t0, a4, a5 -; RVA22U64-PACK-NEXT: lbu a5, 12(a0) -; RVA22U64-PACK-NEXT: lbu a3, 13(a0) -; RVA22U64-PACK-NEXT: lbu a2, 14(a0) +; RVA22U64-PACK-NEXT: packh a7, a7, t0 +; RVA22U64-PACK-NEXT: packh a4, a4, a5 +; RVA22U64-PACK-NEXT: packh a1, a2, a1 +; RVA22U64-PACK-NEXT: lbu a2, 12(a0) +; RVA22U64-PACK-NEXT: lbu a5, 13(a0) +; RVA22U64-PACK-NEXT: lbu a3, 14(a0) ; RVA22U64-PACK-NEXT: lbu a0, 15(a0) -; RVA22U64-PACK-NEXT: packh a4, t2, t3 -; RVA22U64-PACK-NEXT: packh a1, t4, a1 -; RVA22U64-PACK-NEXT: packh a3, a5, a3 -; RVA22U64-PACK-NEXT: packh a0, a2, a0 -; RVA22U64-PACK-NEXT: packw a2, t1, a6 -; RVA22U64-PACK-NEXT: packw a5, a7, t0 +; RVA22U64-PACK-NEXT: packh a2, a2, a5 +; RVA22U64-PACK-NEXT: packh a0, a3, a0 +; RVA22U64-PACK-NEXT: packw a3, a6, t2 +; RVA22U64-PACK-NEXT: packw a5, t1, a7 ; RVA22U64-PACK-NEXT: packw a1, a4, a1 -; RVA22U64-PACK-NEXT: packw a0, a3, a0 -; RVA22U64-PACK-NEXT: pack a2, a2, a5 +; RVA22U64-PACK-NEXT: packw a0, a2, a0 +; RVA22U64-PACK-NEXT: pack a2, a3, a5 ; RVA22U64-PACK-NEXT: pack a0, a1, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a2 @@ -1630,15 +1637,13 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64ZVE32-NEXT: lbu t2, 9(a0) ; RV64ZVE32-NEXT: lbu t3, 10(a0) ; RV64ZVE32-NEXT: lbu t4, 11(a0) -; RV64ZVE32-NEXT: li t5, 255 -; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, t5 ; RV64ZVE32-NEXT: lbu t5, 12(a0) ; RV64ZVE32-NEXT: lbu t6, 13(a0) ; RV64ZVE32-NEXT: lbu s0, 14(a0) ; RV64ZVE32-NEXT: lbu a0, 15(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: li a1, 255 ; RV64ZVE32-NEXT: vmv.v.x v9, t1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t2 @@ -1654,6 +1659,9 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, s0 ; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a1 +; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64ZVE32-NEXT: .cfi_restore s0 @@ -1732,15 +1740,13 @@ define <16 x i8> 
@buildvec_v16i8_loads_gather(ptr %p) { ; RV32-ONLY-NEXT: lbu t2, 154(a0) ; RV32-ONLY-NEXT: lbu t3, 161(a0) ; RV32-ONLY-NEXT: lbu t4, 163(a0) -; RV32-ONLY-NEXT: li t5, 255 -; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, t5 ; RV32-ONLY-NEXT: lbu t5, 93(a0) ; RV32-ONLY-NEXT: lbu t6, 105(a0) ; RV32-ONLY-NEXT: lbu s0, 124(a0) ; RV32-ONLY-NEXT: lbu a0, 144(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: li a1, 255 ; RV32-ONLY-NEXT: vmv.v.x v9, t1 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t5 @@ -1756,6 +1762,9 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 ; RV32-ONLY-NEXT: vslide1down.vx v10, v8, t0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v9, t2 +; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, a1 +; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-ONLY-NEXT: .cfi_restore s0 @@ -1777,38 +1786,38 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32VB-NEXT: slli a2, a2, 8 ; RV32VB-NEXT: slli a3, a3, 16 ; RV32VB-NEXT: slli a4, a4, 24 +; RV32VB-NEXT: slli a7, a7, 8 ; RV32VB-NEXT: or a1, a1, a2 ; RV32VB-NEXT: or a3, a4, a3 -; RV32VB-NEXT: lbu a2, 93(a0) -; RV32VB-NEXT: lbu a4, 105(a0) -; RV32VB-NEXT: lbu t2, 124(a0) -; RV32VB-NEXT: lbu t3, 144(a0) -; RV32VB-NEXT: slli a7, a7, 8 +; RV32VB-NEXT: or a2, a6, a7 +; RV32VB-NEXT: lbu a4, 93(a0) +; RV32VB-NEXT: lbu a6, 105(a0) +; RV32VB-NEXT: lbu a7, 124(a0) +; RV32VB-NEXT: lbu t2, 144(a0) ; RV32VB-NEXT: slli a5, a5, 16 ; RV32VB-NEXT: slli t0, t0, 24 -; RV32VB-NEXT: slli a2, a2, 8 -; RV32VB-NEXT: or a6, a6, a7 +; RV32VB-NEXT: slli a4, a4, 8 ; RV32VB-NEXT: or a5, t0, a5 -; RV32VB-NEXT: lbu a7, 154(a0) -; RV32VB-NEXT: lbu t0, 161(a0) -; RV32VB-NEXT: or a2, t1, a2 +; RV32VB-NEXT: or a4, t1, a4 +; RV32VB-NEXT: lbu t0, 154(a0) +; RV32VB-NEXT: lbu t1, 161(a0) ; RV32VB-NEXT: lbu a0, 163(a0) -; RV32VB-NEXT: slli a4, a4, 16 -; RV32VB-NEXT: slli t0, t0, 24 -; RV32VB-NEXT: or a4, t0, a4 +; RV32VB-NEXT: slli a6, a6, 16 +; RV32VB-NEXT: slli t1, t1, 24 +; RV32VB-NEXT: or a6, t1, a6 ; RV32VB-NEXT: slli a0, a0, 8 -; RV32VB-NEXT: or a0, t2, a0 -; RV32VB-NEXT: slli t3, t3, 16 -; RV32VB-NEXT: slli a7, a7, 24 -; RV32VB-NEXT: or a7, a7, t3 +; RV32VB-NEXT: or a0, a7, a0 +; RV32VB-NEXT: slli t2, t2, 16 +; RV32VB-NEXT: slli t0, t0, 24 +; RV32VB-NEXT: or a7, t0, t2 ; RV32VB-NEXT: or a1, a1, a3 -; RV32VB-NEXT: or a3, a6, a5 -; RV32VB-NEXT: or a2, a2, a4 +; RV32VB-NEXT: or a2, a2, a5 +; RV32VB-NEXT: or a3, a4, a6 ; RV32VB-NEXT: or a0, a0, a7 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 -; RV32VB-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-NEXT: vslide1down.vx v8, v8, a2 +; RV32VB-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: ret ; @@ -1824,24 +1833,24 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV32VB-PACK-NEXT: lbu t0, 75(a0) ; RV32VB-PACK-NEXT: lbu t1, 82(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: lbu a2, 154(a0) -; RV32VB-PACK-NEXT: lbu t2, 161(a0) -; RV32VB-PACK-NEXT: lbu t3, 163(a0) -; RV32VB-PACK-NEXT: packh a3, a3, a4 -; RV32VB-PACK-NEXT: packh a4, a6, a7 +; RV32VB-PACK-NEXT: packh a2, a3, a4 +; RV32VB-PACK-NEXT: packh a3, a6, a7 +; RV32VB-PACK-NEXT: lbu a4, 93(a0) +; 
RV32VB-PACK-NEXT: lbu a6, 105(a0) +; RV32VB-PACK-NEXT: lbu a7, 124(a0) +; RV32VB-PACK-NEXT: lbu t2, 144(a0) ; RV32VB-PACK-NEXT: packh a5, a5, t0 -; RV32VB-PACK-NEXT: lbu a6, 93(a0) -; RV32VB-PACK-NEXT: lbu a7, 105(a0) -; RV32VB-PACK-NEXT: lbu t0, 124(a0) -; RV32VB-PACK-NEXT: lbu a0, 144(a0) -; RV32VB-PACK-NEXT: packh a6, t1, a6 -; RV32VB-PACK-NEXT: packh a7, a7, t2 -; RV32VB-PACK-NEXT: packh t0, t0, t3 -; RV32VB-PACK-NEXT: packh a0, a0, a2 -; RV32VB-PACK-NEXT: pack a1, a1, a3 -; RV32VB-PACK-NEXT: pack a2, a4, a5 -; RV32VB-PACK-NEXT: pack a3, a6, a7 -; RV32VB-PACK-NEXT: pack a0, t0, a0 +; RV32VB-PACK-NEXT: packh a4, t1, a4 +; RV32VB-PACK-NEXT: lbu t0, 154(a0) +; RV32VB-PACK-NEXT: lbu t1, 161(a0) +; RV32VB-PACK-NEXT: lbu a0, 163(a0) +; RV32VB-PACK-NEXT: packh a6, a6, t1 +; RV32VB-PACK-NEXT: packh a0, a7, a0 +; RV32VB-PACK-NEXT: packh a7, t2, t0 +; RV32VB-PACK-NEXT: pack a1, a1, a2 +; RV32VB-PACK-NEXT: pack a2, a3, a5 +; RV32VB-PACK-NEXT: pack a3, a4, a6 +; RV32VB-PACK-NEXT: pack a0, a0, a7 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 @@ -1867,15 +1876,13 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64V-ONLY-NEXT: lbu t2, 154(a0) ; RV64V-ONLY-NEXT: lbu t3, 161(a0) ; RV64V-ONLY-NEXT: lbu t4, 163(a0) -; RV64V-ONLY-NEXT: li t5, 255 -; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, t5 ; RV64V-ONLY-NEXT: lbu t5, 93(a0) ; RV64V-ONLY-NEXT: lbu t6, 105(a0) ; RV64V-ONLY-NEXT: lbu s0, 124(a0) ; RV64V-ONLY-NEXT: lbu a0, 144(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: li a1, 255 ; RV64V-ONLY-NEXT: vmv.v.x v9, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t5 @@ -1891,6 +1898,9 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 ; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, t0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, t2 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, a1 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64V-ONLY-NEXT: .cfi_restore s0 @@ -1900,98 +1910,90 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_loads_gather: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 0(a0) +; RVA22U64-NEXT: lbu a7, 0(a0) ; RVA22U64-NEXT: lbu a2, 1(a0) ; RVA22U64-NEXT: lbu a3, 22(a0) ; RVA22U64-NEXT: lbu a4, 31(a0) ; RVA22U64-NEXT: lbu a6, 623(a0) -; RVA22U64-NEXT: lbu t0, 44(a0) -; RVA22U64-NEXT: lbu a7, 55(a0) -; RVA22U64-NEXT: lbu a5, 75(a0) +; RVA22U64-NEXT: lbu a5, 44(a0) +; RVA22U64-NEXT: lbu a1, 55(a0) +; RVA22U64-NEXT: lbu t0, 75(a0) ; RVA22U64-NEXT: lbu t1, 82(a0) ; RVA22U64-NEXT: slli a2, a2, 8 ; RVA22U64-NEXT: slli a3, a3, 16 ; RVA22U64-NEXT: slli a4, a4, 24 -; RVA22U64-NEXT: or t2, a1, a2 +; RVA22U64-NEXT: slli a5, a5, 32 +; RVA22U64-NEXT: slli a1, a1, 40 +; RVA22U64-NEXT: or a7, a7, a2 ; RVA22U64-NEXT: or t3, a4, a3 -; RVA22U64-NEXT: lbu a2, 93(a0) +; RVA22U64-NEXT: or t2, a1, a5 +; RVA22U64-NEXT: lbu a4, 93(a0) ; RVA22U64-NEXT: lbu t4, 105(a0) -; RVA22U64-NEXT: lbu t6, 124(a0) +; RVA22U64-NEXT: lbu a2, 124(a0) ; RVA22U64-NEXT: lbu t5, 144(a0) -; RVA22U64-NEXT: slli t0, t0, 32 -; RVA22U64-NEXT: slli a7, a7, 40 ; RVA22U64-NEXT: slli a6, a6, 48 -; 
RVA22U64-NEXT: slli a5, a5, 56 -; RVA22U64-NEXT: slli a2, a2, 8 -; RVA22U64-NEXT: or a7, a7, t0 -; RVA22U64-NEXT: or a5, a5, a6 -; RVA22U64-NEXT: lbu a3, 154(a0) +; RVA22U64-NEXT: slli t0, t0, 56 +; RVA22U64-NEXT: slli a4, a4, 8 +; RVA22U64-NEXT: or a3, t0, a6 +; RVA22U64-NEXT: or a4, t1, a4 +; RVA22U64-NEXT: lbu a5, 154(a0) ; RVA22U64-NEXT: lbu a1, 161(a0) -; RVA22U64-NEXT: or a2, t1, a2 ; RVA22U64-NEXT: lbu a0, 163(a0) ; RVA22U64-NEXT: slli t4, t4, 16 ; RVA22U64-NEXT: slli a1, a1, 24 ; RVA22U64-NEXT: or a1, a1, t4 -; RVA22U64-NEXT: slli t6, t6, 32 +; RVA22U64-NEXT: slli a2, a2, 32 ; RVA22U64-NEXT: slli a0, a0, 40 -; RVA22U64-NEXT: or a0, a0, t6 +; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: slli t5, t5, 48 -; RVA22U64-NEXT: slli a3, a3, 56 -; RVA22U64-NEXT: or a3, a3, t5 -; RVA22U64-NEXT: or a4, t2, t3 -; RVA22U64-NEXT: or a5, a5, a7 -; RVA22U64-NEXT: or a1, a1, a2 -; RVA22U64-NEXT: or a0, a0, a3 -; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: slli a5, a5, 56 +; RVA22U64-NEXT: or a2, a5, t5 +; RVA22U64-NEXT: or a5, a7, t3 +; RVA22U64-NEXT: or a3, a3, t2 +; RVA22U64-NEXT: or a1, a1, a4 +; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: or a3, a3, a5 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a4 +; RVA22U64-NEXT: vmv.v.x v8, a3 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_gather: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: addi sp, sp, -16 -; RVA22U64-PACK-NEXT: .cfi_def_cfa_offset 16 -; RVA22U64-PACK-NEXT: sd s0, 8(sp) # 8-byte Folded Spill -; RVA22U64-PACK-NEXT: .cfi_offset s0, -8 -; RVA22U64-PACK-NEXT: lbu a1, 0(a0) -; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a6, 22(a0) -; RVA22U64-PACK-NEXT: lbu a7, 31(a0) -; RVA22U64-PACK-NEXT: lbu t0, 623(a0) -; RVA22U64-PACK-NEXT: lbu t3, 44(a0) -; RVA22U64-PACK-NEXT: lbu t4, 55(a0) -; RVA22U64-PACK-NEXT: lbu t5, 75(a0) -; RVA22U64-PACK-NEXT: lbu t1, 82(a0) -; RVA22U64-PACK-NEXT: packh t2, a1, a2 -; RVA22U64-PACK-NEXT: lbu t6, 154(a0) -; RVA22U64-PACK-NEXT: lbu s0, 161(a0) -; RVA22U64-PACK-NEXT: lbu a3, 163(a0) -; RVA22U64-PACK-NEXT: packh a6, a6, a7 -; RVA22U64-PACK-NEXT: packh a7, t3, t4 -; RVA22U64-PACK-NEXT: packh a2, t0, t5 +; RVA22U64-PACK-NEXT: lbu a7, 0(a0) +; RVA22U64-PACK-NEXT: lbu t1, 1(a0) +; RVA22U64-PACK-NEXT: lbu a3, 22(a0) +; RVA22U64-PACK-NEXT: lbu a4, 31(a0) +; RVA22U64-PACK-NEXT: lbu a6, 623(a0) +; RVA22U64-PACK-NEXT: lbu a5, 44(a0) +; RVA22U64-PACK-NEXT: lbu a1, 55(a0) +; RVA22U64-PACK-NEXT: lbu t0, 75(a0) +; RVA22U64-PACK-NEXT: lbu t3, 82(a0) +; RVA22U64-PACK-NEXT: packh a7, a7, t1 +; RVA22U64-PACK-NEXT: packh t2, a3, a4 +; RVA22U64-PACK-NEXT: packh t1, a5, a1 ; RVA22U64-PACK-NEXT: lbu a4, 93(a0) -; RVA22U64-PACK-NEXT: lbu a5, 105(a0) -; RVA22U64-PACK-NEXT: lbu a1, 124(a0) -; RVA22U64-PACK-NEXT: lbu a0, 144(a0) -; RVA22U64-PACK-NEXT: packh a4, t1, a4 -; RVA22U64-PACK-NEXT: packh a5, a5, s0 -; RVA22U64-PACK-NEXT: packh a1, a1, a3 -; RVA22U64-PACK-NEXT: packh a0, a0, t6 -; RVA22U64-PACK-NEXT: packw a3, t2, a6 -; RVA22U64-PACK-NEXT: packw a2, a7, a2 -; RVA22U64-PACK-NEXT: packw a4, a4, a5 -; RVA22U64-PACK-NEXT: packw a0, a1, a0 -; RVA22U64-PACK-NEXT: pack a1, a3, a2 -; RVA22U64-PACK-NEXT: pack a0, a4, a0 +; RVA22U64-PACK-NEXT: lbu t4, 105(a0) +; RVA22U64-PACK-NEXT: lbu t5, 124(a0) +; RVA22U64-PACK-NEXT: lbu a3, 144(a0) +; RVA22U64-PACK-NEXT: packh a2, a6, t0 +; RVA22U64-PACK-NEXT: packh a4, t3, a4 +; RVA22U64-PACK-NEXT: lbu a5, 154(a0) +; RVA22U64-PACK-NEXT: lbu a1, 
161(a0) +; RVA22U64-PACK-NEXT: lbu a0, 163(a0) +; RVA22U64-PACK-NEXT: packh a1, t4, a1 +; RVA22U64-PACK-NEXT: packh a0, t5, a0 +; RVA22U64-PACK-NEXT: packh a3, a3, a5 +; RVA22U64-PACK-NEXT: packw a5, a7, t2 +; RVA22U64-PACK-NEXT: packw a2, t1, a2 +; RVA22U64-PACK-NEXT: packw a1, a4, a1 +; RVA22U64-PACK-NEXT: packw a0, a0, a3 +; RVA22U64-PACK-NEXT: pack a2, a5, a2 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 +; RVA22U64-PACK-NEXT: vmv.v.x v8, a2 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 -; RVA22U64-PACK-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RVA22U64-PACK-NEXT: .cfi_restore s0 -; RVA22U64-PACK-NEXT: addi sp, sp, 16 -; RVA22U64-PACK-NEXT: .cfi_def_cfa_offset 0 ; RVA22U64-PACK-NEXT: ret ; ; RV64ZVE32-LABEL: buildvec_v16i8_loads_gather: @@ -2012,15 +2014,13 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64ZVE32-NEXT: lbu t2, 154(a0) ; RV64ZVE32-NEXT: lbu t3, 161(a0) ; RV64ZVE32-NEXT: lbu t4, 163(a0) -; RV64ZVE32-NEXT: li t5, 255 -; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, t5 ; RV64ZVE32-NEXT: lbu t5, 93(a0) ; RV64ZVE32-NEXT: lbu t6, 105(a0) ; RV64ZVE32-NEXT: lbu s0, 124(a0) ; RV64ZVE32-NEXT: lbu a0, 144(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: li a1, 255 ; RV64ZVE32-NEXT: vmv.v.x v9, t1 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t5 @@ -2036,6 +2036,9 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 ; RV64ZVE32-NEXT: vslide1down.vx v10, v8, t0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v9, t2 +; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a1 +; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; RV64ZVE32-NEXT: .cfi_restore s0 @@ -2118,28 +2121,28 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_undef_low_half: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 93(a0) -; RV32VB-NEXT: lbu a2, 82(a0) +; RV32VB-NEXT: lbu a1, 82(a0) +; RV32VB-NEXT: lbu a2, 93(a0) ; RV32VB-NEXT: lbu a3, 105(a0) ; RV32VB-NEXT: lbu a4, 124(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: lbu a5, 144(a0) -; RV32VB-NEXT: lbu a6, 154(a0) -; RV32VB-NEXT: lbu a7, 161(a0) -; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: slli a2, a2, 8 +; RV32VB-NEXT: or a1, a1, a2 +; RV32VB-NEXT: lbu a2, 144(a0) +; RV32VB-NEXT: lbu a5, 154(a0) +; RV32VB-NEXT: lbu a6, 161(a0) ; RV32VB-NEXT: lbu a0, 163(a0) ; RV32VB-NEXT: slli a3, a3, 16 -; RV32VB-NEXT: slli a7, a7, 24 -; RV32VB-NEXT: or a2, a7, a3 +; RV32VB-NEXT: slli a6, a6, 24 +; RV32VB-NEXT: or a3, a6, a3 ; RV32VB-NEXT: slli a0, a0, 8 ; RV32VB-NEXT: or a0, a4, a0 -; RV32VB-NEXT: slli a5, a5, 16 -; RV32VB-NEXT: slli a6, a6, 24 -; RV32VB-NEXT: or a3, a6, a5 +; RV32VB-NEXT: slli a2, a2, 16 +; RV32VB-NEXT: slli a5, a5, 24 +; RV32VB-NEXT: or a2, a5, a2 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.i v8, 0 -; RV32VB-NEXT: or a1, a1, a2 -; RV32VB-NEXT: or a0, a0, a3 +; RV32VB-NEXT: or a1, a1, a3 +; RV32VB-NEXT: or a0, a0, a2 ; RV32VB-NEXT: vslide1down.vx v8, v8, zero ; RV32VB-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 @@ -2151,21 +2154,21 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; 
RV32VB-PACK-NEXT: lbu a2, 93(a0) ; RV32VB-PACK-NEXT: lbu a3, 105(a0) ; RV32VB-PACK-NEXT: lbu a4, 124(a0) -; RV32VB-PACK-NEXT: lbu a5, 161(a0) -; RV32VB-PACK-NEXT: lbu a6, 163(a0) -; RV32VB-PACK-NEXT: lbu a7, 144(a0) -; RV32VB-PACK-NEXT: lbu a0, 154(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: packh a2, a3, a5 -; RV32VB-PACK-NEXT: packh a3, a4, a6 -; RV32VB-PACK-NEXT: packh a0, a7, a0 -; RV32VB-PACK-NEXT: pack a1, a1, a2 -; RV32VB-PACK-NEXT: packh a2, a0, a0 -; RV32VB-PACK-NEXT: pack a2, a2, a2 +; RV32VB-PACK-NEXT: lbu a2, 144(a0) +; RV32VB-PACK-NEXT: lbu a5, 154(a0) +; RV32VB-PACK-NEXT: lbu a6, 161(a0) +; RV32VB-PACK-NEXT: lbu a0, 163(a0) +; RV32VB-PACK-NEXT: packh a3, a3, a6 +; RV32VB-PACK-NEXT: packh a0, a4, a0 +; RV32VB-PACK-NEXT: packh a2, a2, a5 +; RV32VB-PACK-NEXT: pack a1, a1, a3 +; RV32VB-PACK-NEXT: packh a3, a0, a0 +; RV32VB-PACK-NEXT: pack a3, a3, a3 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32VB-PACK-NEXT: vmv.v.x v8, a2 -; RV32VB-PACK-NEXT: pack a0, a3, a0 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 +; RV32VB-PACK-NEXT: vmv.v.x v8, a3 +; RV32VB-PACK-NEXT: pack a0, a0, a2 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret @@ -2193,26 +2196,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_undef_low_half: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 93(a0) -; RVA22U64-NEXT: lbu a6, 82(a0) -; RVA22U64-NEXT: lbu a7, 105(a0) +; RVA22U64-NEXT: lbu a1, 82(a0) +; RVA22U64-NEXT: lbu a2, 93(a0) +; RVA22U64-NEXT: lbu a3, 105(a0) ; RVA22U64-NEXT: lbu a4, 124(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: lbu a5, 144(a0) -; RVA22U64-NEXT: lbu a2, 154(a0) -; RVA22U64-NEXT: lbu a3, 161(a0) -; RVA22U64-NEXT: or a1, a6, a1 +; RVA22U64-NEXT: slli a2, a2, 8 +; RVA22U64-NEXT: or a6, a1, a2 +; RVA22U64-NEXT: lbu a2, 144(a0) +; RVA22U64-NEXT: lbu a5, 154(a0) +; RVA22U64-NEXT: lbu a1, 161(a0) ; RVA22U64-NEXT: lbu a0, 163(a0) -; RVA22U64-NEXT: slli a7, a7, 16 -; RVA22U64-NEXT: slli a3, a3, 24 -; RVA22U64-NEXT: or a3, a3, a7 +; RVA22U64-NEXT: slli a3, a3, 16 +; RVA22U64-NEXT: slli a1, a1, 24 +; RVA22U64-NEXT: or a1, a1, a3 ; RVA22U64-NEXT: slli a4, a4, 32 ; RVA22U64-NEXT: slli a0, a0, 40 ; RVA22U64-NEXT: or a0, a0, a4 -; RVA22U64-NEXT: slli a5, a5, 48 -; RVA22U64-NEXT: slli a2, a2, 56 +; RVA22U64-NEXT: slli a2, a2, 48 +; RVA22U64-NEXT: slli a5, a5, 56 ; RVA22U64-NEXT: or a2, a2, a5 -; RVA22U64-NEXT: or a1, a1, a3 +; RVA22U64-NEXT: or a1, a6, a1 ; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -2222,24 +2225,24 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_low_half: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a6, 82(a0) -; RVA22U64-PACK-NEXT: lbu a7, 93(a0) -; RVA22U64-PACK-NEXT: lbu t0, 105(a0) +; RVA22U64-PACK-NEXT: lbu a1, 82(a0) +; RVA22U64-PACK-NEXT: lbu a2, 93(a0) +; RVA22U64-PACK-NEXT: lbu a6, 105(a0) ; RVA22U64-PACK-NEXT: lbu a4, 124(a0) -; RVA22U64-PACK-NEXT: lbu a5, 161(a0) -; RVA22U64-PACK-NEXT: lbu a1, 163(a0) +; RVA22U64-PACK-NEXT: packh a1, a1, a2 ; RVA22U64-PACK-NEXT: lbu a2, 144(a0) -; RVA22U64-PACK-NEXT: lbu a0, 154(a0) -; RVA22U64-PACK-NEXT: packh a3, a6, a7 -; RVA22U64-PACK-NEXT: packh a5, t0, a5 -; RVA22U64-PACK-NEXT: packh a1, a4, a1 -; RVA22U64-PACK-NEXT: packh a0, a2, a0 -; RVA22U64-PACK-NEXT: packw a2, a3, a5 +; RVA22U64-PACK-NEXT: lbu a5, 
154(a0) +; RVA22U64-PACK-NEXT: lbu a3, 161(a0) +; RVA22U64-PACK-NEXT: lbu a0, 163(a0) +; RVA22U64-PACK-NEXT: packh a3, a6, a3 +; RVA22U64-PACK-NEXT: packh a0, a4, a0 +; RVA22U64-PACK-NEXT: packh a2, a2, a5 +; RVA22U64-PACK-NEXT: packw a1, a1, a3 ; RVA22U64-PACK-NEXT: packh a3, a0, a0 ; RVA22U64-PACK-NEXT: packw a3, a3, a3 ; RVA22U64-PACK-NEXT: pack a3, a3, a3 -; RVA22U64-PACK-NEXT: packw a0, a1, a0 -; RVA22U64-PACK-NEXT: pack a0, a2, a0 +; RVA22U64-PACK-NEXT: packw a0, a0, a2 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a3 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 @@ -2319,26 +2322,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_undef_high_half: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 1(a0) -; RV32VB-NEXT: lbu a2, 22(a0) -; RV32VB-NEXT: lbu a3, 31(a0) -; RV32VB-NEXT: lbu a4, 0(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: slli a2, a2, 16 -; RV32VB-NEXT: slli a3, a3, 24 -; RV32VB-NEXT: or a1, a4, a1 -; RV32VB-NEXT: lbu a4, 44(a0) -; RV32VB-NEXT: lbu a5, 55(a0) -; RV32VB-NEXT: or a2, a3, a2 -; RV32VB-NEXT: lbu a3, 623(a0) -; RV32VB-NEXT: lbu a0, 75(a0) -; RV32VB-NEXT: slli a5, a5, 8 -; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: lbu a1, 0(a0) +; RV32VB-NEXT: lbu a2, 1(a0) +; RV32VB-NEXT: lbu a3, 22(a0) +; RV32VB-NEXT: lbu a4, 31(a0) +; RV32VB-NEXT: slli a2, a2, 8 ; RV32VB-NEXT: slli a3, a3, 16 -; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a3 +; RV32VB-NEXT: slli a4, a4, 24 ; RV32VB-NEXT: or a1, a1, a2 -; RV32VB-NEXT: or a0, a4, a0 +; RV32VB-NEXT: or a3, a4, a3 +; RV32VB-NEXT: lbu a2, 44(a0) +; RV32VB-NEXT: lbu a4, 55(a0) +; RV32VB-NEXT: lbu a5, 75(a0) +; RV32VB-NEXT: slli a4, a4, 8 +; RV32VB-NEXT: or a2, a2, a4 +; RV32VB-NEXT: lbu a0, 623(a0) +; RV32VB-NEXT: slli a0, a0, 16 +; RV32VB-NEXT: slli a5, a5, 24 +; RV32VB-NEXT: or a0, a5, a0 +; RV32VB-NEXT: or a1, a1, a3 +; RV32VB-NEXT: or a0, a2, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 @@ -2352,14 +2355,14 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 22(a0) ; RV32VB-PACK-NEXT: lbu a4, 31(a0) -; RV32VB-PACK-NEXT: lbu a5, 623(a0) -; RV32VB-PACK-NEXT: lbu a6, 44(a0) -; RV32VB-PACK-NEXT: lbu a7, 55(a0) -; RV32VB-PACK-NEXT: lbu a0, 75(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 ; RV32VB-PACK-NEXT: packh a2, a3, a4 -; RV32VB-PACK-NEXT: packh a3, a6, a7 -; RV32VB-PACK-NEXT: packh a0, a5, a0 +; RV32VB-PACK-NEXT: lbu a3, 44(a0) +; RV32VB-PACK-NEXT: lbu a4, 55(a0) +; RV32VB-PACK-NEXT: lbu a5, 75(a0) +; RV32VB-PACK-NEXT: packh a3, a3, a4 +; RV32VB-PACK-NEXT: lbu a0, 623(a0) +; RV32VB-PACK-NEXT: packh a0, a0, a5 ; RV32VB-PACK-NEXT: pack a1, a1, a2 ; RV32VB-PACK-NEXT: packh a2, a0, a0 ; RV32VB-PACK-NEXT: pack a0, a3, a0 @@ -2395,27 +2398,27 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_undef_high_half: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 1(a0) -; RVA22U64-NEXT: lbu a2, 22(a0) -; RVA22U64-NEXT: lbu a3, 31(a0) -; RVA22U64-NEXT: lbu a4, 0(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: slli a2, a2, 16 -; RVA22U64-NEXT: slli a3, a3, 24 -; RVA22U64-NEXT: or a1, a1, a4 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: lbu a3, 44(a0) +; RVA22U64-NEXT: lbu a1, 0(a0) +; RVA22U64-NEXT: lbu a2, 1(a0) +; RVA22U64-NEXT: lbu a3, 22(a0) +; RVA22U64-NEXT: lbu a4, 31(a0) +; RVA22U64-NEXT: slli 
a2, a2, 8 +; RVA22U64-NEXT: slli a3, a3, 16 +; RVA22U64-NEXT: slli a4, a4, 24 +; RVA22U64-NEXT: or a1, a1, a2 +; RVA22U64-NEXT: or a3, a3, a4 +; RVA22U64-NEXT: lbu a2, 44(a0) ; RVA22U64-NEXT: lbu a4, 55(a0) -; RVA22U64-NEXT: lbu a5, 623(a0) -; RVA22U64-NEXT: lbu a0, 75(a0) -; RVA22U64-NEXT: slli a3, a3, 32 +; RVA22U64-NEXT: lbu a5, 75(a0) +; RVA22U64-NEXT: slli a2, a2, 32 ; RVA22U64-NEXT: slli a4, a4, 40 -; RVA22U64-NEXT: or a3, a3, a4 -; RVA22U64-NEXT: slli a5, a5, 48 -; RVA22U64-NEXT: slli a0, a0, 56 +; RVA22U64-NEXT: or a2, a2, a4 +; RVA22U64-NEXT: lbu a0, 623(a0) +; RVA22U64-NEXT: slli a0, a0, 48 +; RVA22U64-NEXT: slli a5, a5, 56 ; RVA22U64-NEXT: or a0, a0, a5 -; RVA22U64-NEXT: or a1, a1, a2 -; RVA22U64-NEXT: or a0, a0, a3 +; RVA22U64-NEXT: or a1, a1, a3 +; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: or a0, a0, a1 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-NEXT: vmv.v.x v8, a0 @@ -2424,26 +2427,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_high_half: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a6, 0(a0) -; RVA22U64-PACK-NEXT: lbu a7, 1(a0) -; RVA22U64-PACK-NEXT: lbu t0, 22(a0) +; RVA22U64-PACK-NEXT: lbu a1, 0(a0) +; RVA22U64-PACK-NEXT: lbu a2, 1(a0) +; RVA22U64-PACK-NEXT: lbu a3, 22(a0) ; RVA22U64-PACK-NEXT: lbu a4, 31(a0) -; RVA22U64-PACK-NEXT: lbu a5, 623(a0) -; RVA22U64-PACK-NEXT: lbu a1, 44(a0) -; RVA22U64-PACK-NEXT: lbu a2, 55(a0) -; RVA22U64-PACK-NEXT: lbu a0, 75(a0) -; RVA22U64-PACK-NEXT: packh a3, a6, a7 -; RVA22U64-PACK-NEXT: packh a4, t0, a4 ; RVA22U64-PACK-NEXT: packh a1, a1, a2 -; RVA22U64-PACK-NEXT: packh a0, a5, a0 -; RVA22U64-PACK-NEXT: packw a2, a3, a4 -; RVA22U64-PACK-NEXT: packh a3, a0, a0 -; RVA22U64-PACK-NEXT: packw a3, a3, a3 -; RVA22U64-PACK-NEXT: packw a0, a1, a0 -; RVA22U64-PACK-NEXT: pack a0, a2, a0 +; RVA22U64-PACK-NEXT: packh a2, a3, a4 +; RVA22U64-PACK-NEXT: lbu a3, 44(a0) +; RVA22U64-PACK-NEXT: lbu a4, 55(a0) +; RVA22U64-PACK-NEXT: lbu a5, 75(a0) +; RVA22U64-PACK-NEXT: packh a3, a3, a4 +; RVA22U64-PACK-NEXT: lbu a0, 623(a0) +; RVA22U64-PACK-NEXT: packh a0, a0, a5 +; RVA22U64-PACK-NEXT: packw a1, a1, a2 +; RVA22U64-PACK-NEXT: packh a2, a0, a0 +; RVA22U64-PACK-NEXT: packw a2, a2, a2 +; RVA22U64-PACK-NEXT: packw a0, a3, a0 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a0 -; RVA22U64-PACK-NEXT: pack a0, a3, a3 +; RVA22U64-PACK-NEXT: pack a0, a2, a2 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -2504,15 +2507,13 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32-ONLY-NEXT: lbu a3, 44(a0) ; RV32-ONLY-NEXT: lbu a4, 55(a0) ; RV32-ONLY-NEXT: lbu a5, 75(a0) -; RV32-ONLY-NEXT: li a6, 255 -; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, a6 ; RV32-ONLY-NEXT: lbu a6, 82(a0) ; RV32-ONLY-NEXT: lbu a7, 93(a0) ; RV32-ONLY-NEXT: lbu t0, 105(a0) ; RV32-ONLY-NEXT: lbu a0, 161(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-ONLY-NEXT: vmv.v.x v8, a2 +; RV32-ONLY-NEXT: li a2, 255 ; RV32-ONLY-NEXT: vmv.v.x v9, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 @@ -2522,35 +2523,38 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 ; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 ; RV32-ONLY-NEXT: vslidedown.vi v8, v9, 4 +; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, 
ma +; RV32-ONLY-NEXT: vmv.s.x v0, a2 +; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_undef_edges: ; RV32VB: # %bb.0: ; RV32VB-NEXT: lbu a1, 623(a0) -; RV32VB-NEXT: lbu a2, 55(a0) -; RV32VB-NEXT: lbu a3, 75(a0) -; RV32VB-NEXT: lbu a4, 31(a0) -; RV32VB-NEXT: lbu a5, 44(a0) -; RV32VB-NEXT: slli a2, a2, 8 +; RV32VB-NEXT: lbu a2, 31(a0) +; RV32VB-NEXT: lbu a3, 44(a0) +; RV32VB-NEXT: lbu a4, 55(a0) +; RV32VB-NEXT: lbu a5, 75(a0) +; RV32VB-NEXT: slli a4, a4, 8 ; RV32VB-NEXT: slli a1, a1, 16 -; RV32VB-NEXT: slli a3, a3, 24 -; RV32VB-NEXT: or a2, a5, a2 -; RV32VB-NEXT: lbu a5, 82(a0) -; RV32VB-NEXT: lbu a6, 93(a0) -; RV32VB-NEXT: or a1, a3, a1 -; RV32VB-NEXT: lbu a3, 105(a0) +; RV32VB-NEXT: slli a5, a5, 24 +; RV32VB-NEXT: or a3, a3, a4 +; RV32VB-NEXT: or a1, a5, a1 +; RV32VB-NEXT: lbu a4, 82(a0) +; RV32VB-NEXT: lbu a5, 93(a0) +; RV32VB-NEXT: lbu a6, 105(a0) ; RV32VB-NEXT: lbu a0, 161(a0) -; RV32VB-NEXT: slli a6, a6, 8 -; RV32VB-NEXT: or a5, a5, a6 -; RV32VB-NEXT: slli a3, a3, 16 +; RV32VB-NEXT: slli a5, a5, 8 +; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: slli a6, a6, 16 ; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a3 -; RV32VB-NEXT: slli a4, a4, 24 -; RV32VB-NEXT: or a1, a2, a1 -; RV32VB-NEXT: or a0, a5, a0 +; RV32VB-NEXT: or a0, a0, a6 +; RV32VB-NEXT: slli a2, a2, 24 +; RV32VB-NEXT: or a1, a3, a1 +; RV32VB-NEXT: or a0, a4, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32VB-NEXT: vmv.v.x v8, a4 +; RV32VB-NEXT: vmv.v.x v8, a2 ; RV32VB-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: vslide1down.vx v8, v8, zero @@ -2563,14 +2567,14 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV32VB-PACK-NEXT: lbu a3, 44(a0) ; RV32VB-PACK-NEXT: lbu a4, 55(a0) ; RV32VB-PACK-NEXT: lbu a5, 75(a0) -; RV32VB-PACK-NEXT: lbu a6, 82(a0) -; RV32VB-PACK-NEXT: lbu a7, 93(a0) -; RV32VB-PACK-NEXT: lbu t0, 105(a0) -; RV32VB-PACK-NEXT: lbu a0, 161(a0) ; RV32VB-PACK-NEXT: packh a3, a3, a4 ; RV32VB-PACK-NEXT: packh a1, a1, a5 -; RV32VB-PACK-NEXT: packh a4, a6, a7 -; RV32VB-PACK-NEXT: packh a0, t0, a0 +; RV32VB-PACK-NEXT: lbu a4, 82(a0) +; RV32VB-PACK-NEXT: lbu a5, 93(a0) +; RV32VB-PACK-NEXT: lbu a6, 105(a0) +; RV32VB-PACK-NEXT: lbu a0, 161(a0) +; RV32VB-PACK-NEXT: packh a4, a4, a5 +; RV32VB-PACK-NEXT: packh a0, a6, a0 ; RV32VB-PACK-NEXT: packh a5, a0, a0 ; RV32VB-PACK-NEXT: packh a2, a0, a2 ; RV32VB-PACK-NEXT: pack a2, a5, a2 @@ -2591,15 +2595,13 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64V-ONLY-NEXT: lbu a3, 44(a0) ; RV64V-ONLY-NEXT: lbu a4, 55(a0) ; RV64V-ONLY-NEXT: lbu a5, 75(a0) -; RV64V-ONLY-NEXT: li a6, 255 -; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, a6 ; RV64V-ONLY-NEXT: lbu a6, 82(a0) ; RV64V-ONLY-NEXT: lbu a7, 93(a0) ; RV64V-ONLY-NEXT: lbu t0, 105(a0) ; RV64V-ONLY-NEXT: lbu a0, 161(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64V-ONLY-NEXT: vmv.v.x v8, a2 +; RV64V-ONLY-NEXT: li a2, 255 ; RV64V-ONLY-NEXT: vmv.v.x v9, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 @@ -2609,65 +2611,68 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 ; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v9, 4 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, a2 +; 
RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_undef_edges: ; RVA22U64: # %bb.0: +; RVA22U64-NEXT: lbu a1, 623(a0) ; RVA22U64-NEXT: lbu a6, 31(a0) -; RVA22U64-NEXT: lbu a2, 44(a0) -; RVA22U64-NEXT: lbu a3, 55(a0) -; RVA22U64-NEXT: lbu a4, 623(a0) +; RVA22U64-NEXT: lbu a3, 44(a0) +; RVA22U64-NEXT: lbu a4, 55(a0) ; RVA22U64-NEXT: lbu a5, 75(a0) -; RVA22U64-NEXT: slli a2, a2, 32 -; RVA22U64-NEXT: slli a3, a3, 40 -; RVA22U64-NEXT: slli a4, a4, 48 +; RVA22U64-NEXT: slli a3, a3, 32 +; RVA22U64-NEXT: slli a4, a4, 40 +; RVA22U64-NEXT: slli a1, a1, 48 ; RVA22U64-NEXT: slli a5, a5, 56 -; RVA22U64-NEXT: or a2, a2, a3 -; RVA22U64-NEXT: lbu a3, 82(a0) -; RVA22U64-NEXT: lbu a1, 93(a0) -; RVA22U64-NEXT: or a4, a4, a5 -; RVA22U64-NEXT: lbu a5, 105(a0) +; RVA22U64-NEXT: or a3, a3, a4 +; RVA22U64-NEXT: or a1, a1, a5 +; RVA22U64-NEXT: lbu a4, 82(a0) +; RVA22U64-NEXT: lbu a5, 93(a0) +; RVA22U64-NEXT: lbu a2, 105(a0) ; RVA22U64-NEXT: lbu a0, 161(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a3 -; RVA22U64-NEXT: slli a5, a5, 16 +; RVA22U64-NEXT: slli a5, a5, 8 +; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: slli a2, a2, 16 ; RVA22U64-NEXT: slli a0, a0, 24 -; RVA22U64-NEXT: or a0, a0, a5 +; RVA22U64-NEXT: or a0, a0, a2 ; RVA22U64-NEXT: slli a6, a6, 24 -; RVA22U64-NEXT: or a2, a2, a4 -; RVA22U64-NEXT: add.uw a2, a6, a2 -; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: or a1, a1, a3 +; RVA22U64-NEXT: add.uw a1, a6, a1 +; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a2 +; RVA22U64-NEXT: vmv.v.x v8, a1 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_edges: ; RVA22U64-PACK: # %bb.0: -; RVA22U64-PACK-NEXT: lbu a7, 623(a0) -; RVA22U64-PACK-NEXT: lbu a6, 31(a0) -; RVA22U64-PACK-NEXT: lbu t0, 44(a0) +; RVA22U64-PACK-NEXT: lbu a1, 623(a0) +; RVA22U64-PACK-NEXT: lbu a2, 31(a0) +; RVA22U64-PACK-NEXT: lbu a3, 44(a0) ; RVA22U64-PACK-NEXT: lbu a4, 55(a0) ; RVA22U64-PACK-NEXT: lbu a5, 75(a0) -; RVA22U64-PACK-NEXT: lbu a2, 82(a0) -; RVA22U64-PACK-NEXT: lbu a1, 93(a0) +; RVA22U64-PACK-NEXT: packh a6, a3, a4 +; RVA22U64-PACK-NEXT: packh a1, a1, a5 +; RVA22U64-PACK-NEXT: lbu a4, 82(a0) +; RVA22U64-PACK-NEXT: lbu a5, 93(a0) ; RVA22U64-PACK-NEXT: lbu a3, 105(a0) ; RVA22U64-PACK-NEXT: lbu a0, 161(a0) -; RVA22U64-PACK-NEXT: packh a4, t0, a4 -; RVA22U64-PACK-NEXT: packh a5, a7, a5 -; RVA22U64-PACK-NEXT: packh a1, a2, a1 +; RVA22U64-PACK-NEXT: packh a4, a4, a5 ; RVA22U64-PACK-NEXT: packh a0, a3, a0 -; RVA22U64-PACK-NEXT: packh a2, a0, a0 -; RVA22U64-PACK-NEXT: packh a3, a0, a6 -; RVA22U64-PACK-NEXT: packw a3, a2, a3 -; RVA22U64-PACK-NEXT: packw a2, a2, a2 -; RVA22U64-PACK-NEXT: packw a4, a4, a5 -; RVA22U64-PACK-NEXT: packw a0, a1, a0 -; RVA22U64-PACK-NEXT: pack a1, a3, a4 +; RVA22U64-PACK-NEXT: packh a3, a0, a0 +; RVA22U64-PACK-NEXT: packh a2, a0, a2 +; RVA22U64-PACK-NEXT: packw a2, a3, a2 +; RVA22U64-PACK-NEXT: packw a3, a3, a3 +; RVA22U64-PACK-NEXT: packw a1, a6, a1 +; RVA22U64-PACK-NEXT: packw a0, a4, a0 +; RVA22U64-PACK-NEXT: pack a1, a2, a1 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 -; RVA22U64-PACK-NEXT: pack a0, a0, a2 +; RVA22U64-PACK-NEXT: pack a0, a0, a3 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -2678,15 +2683,13 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; 
RV64ZVE32-NEXT: lbu a3, 44(a0) ; RV64ZVE32-NEXT: lbu a4, 55(a0) ; RV64ZVE32-NEXT: lbu a5, 75(a0) -; RV64ZVE32-NEXT: li a6, 255 -; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, a6 ; RV64ZVE32-NEXT: lbu a6, 82(a0) ; RV64ZVE32-NEXT: lbu a7, 93(a0) ; RV64ZVE32-NEXT: lbu t0, 105(a0) ; RV64ZVE32-NEXT: lbu a0, 161(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64ZVE32-NEXT: vmv.v.x v8, a2 +; RV64ZVE32-NEXT: li a2, 255 ; RV64ZVE32-NEXT: vmv.v.x v9, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 @@ -2696,6 +2699,9 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 ; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 ; RV64ZVE32-NEXT: vslidedown.vi v8, v9, 4 +; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a2 +; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ret %p4 = getelementptr i8, ptr %p, i32 31 @@ -2741,13 +2747,11 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32-ONLY-NEXT: lbu a6, 82(a0) ; RV32-ONLY-NEXT: lbu a7, 93(a0) ; RV32-ONLY-NEXT: lbu t0, 124(a0) -; RV32-ONLY-NEXT: li t1, 255 -; RV32-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-ONLY-NEXT: vmv.s.x v0, t1 ; RV32-ONLY-NEXT: lbu t1, 144(a0) ; RV32-ONLY-NEXT: lbu a0, 154(a0) -; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV32-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-ONLY-NEXT: vmv.v.x v8, a1 +; RV32-ONLY-NEXT: li a1, 255 ; RV32-ONLY-NEXT: vmv.v.x v9, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 @@ -2761,37 +2765,40 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t1 ; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV32-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-ONLY-NEXT: vmv.s.x v0, a1 +; RV32-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV32-ONLY-NEXT: ret ; ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 1(a0) -; RV32VB-NEXT: lbu a2, 0(a0) +; RV32VB-NEXT: lbu a1, 0(a0) +; RV32VB-NEXT: lbu a2, 1(a0) ; RV32VB-NEXT: lbu a3, 44(a0) ; RV32VB-NEXT: lbu a4, 55(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: or a1, a2, a1 -; RV32VB-NEXT: lbu a2, 75(a0) -; RV32VB-NEXT: lbu a5, 82(a0) -; RV32VB-NEXT: lbu a6, 93(a0) -; RV32VB-NEXT: lbu a7, 124(a0) +; RV32VB-NEXT: slli a2, a2, 8 ; RV32VB-NEXT: slli a4, a4, 8 +; RV32VB-NEXT: or a1, a1, a2 ; RV32VB-NEXT: or a3, a3, a4 -; RV32VB-NEXT: lbu a4, 144(a0) +; RV32VB-NEXT: lbu a2, 75(a0) +; RV32VB-NEXT: lbu a4, 82(a0) +; RV32VB-NEXT: lbu a5, 93(a0) +; RV32VB-NEXT: lbu a6, 124(a0) +; RV32VB-NEXT: slli a5, a5, 8 +; RV32VB-NEXT: or a4, a4, a5 +; RV32VB-NEXT: lbu a5, 144(a0) ; RV32VB-NEXT: lbu a0, 154(a0) -; RV32VB-NEXT: slli a6, a6, 8 -; RV32VB-NEXT: or a5, a5, a6 -; RV32VB-NEXT: slli a4, a4, 16 +; RV32VB-NEXT: slli a5, a5, 16 ; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a4 +; RV32VB-NEXT: or a0, a0, a5 ; RV32VB-NEXT: slli a2, a2, 24 ; RV32VB-NEXT: or a2, a3, a2 -; RV32VB-NEXT: or a0, a7, a0 +; RV32VB-NEXT: or a0, a6, a0 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-NEXT: vmv.v.x v8, a1 ; RV32VB-NEXT: vslide1down.vx v8, v8, a2 -; RV32VB-NEXT: vslide1down.vx v8, v8, a5 +; RV32VB-NEXT: 
vslide1down.vx v8, v8, a4 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-NEXT: ret ; @@ -2801,26 +2808,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32VB-PACK-NEXT: lbu a2, 1(a0) ; RV32VB-PACK-NEXT: lbu a3, 44(a0) ; RV32VB-PACK-NEXT: lbu a4, 55(a0) -; RV32VB-PACK-NEXT: lbu a5, 75(a0) -; RV32VB-PACK-NEXT: lbu a6, 82(a0) -; RV32VB-PACK-NEXT: lbu a7, 93(a0) ; RV32VB-PACK-NEXT: packh a1, a1, a2 -; RV32VB-PACK-NEXT: lbu a2, 144(a0) -; RV32VB-PACK-NEXT: lbu t0, 154(a0) -; RV32VB-PACK-NEXT: packh a3, a3, a4 -; RV32VB-PACK-NEXT: lbu a0, 124(a0) -; RV32VB-PACK-NEXT: packh a4, a6, a7 -; RV32VB-PACK-NEXT: packh a2, a2, t0 -; RV32VB-PACK-NEXT: packh a5, a0, a5 -; RV32VB-PACK-NEXT: pack a3, a3, a5 -; RV32VB-PACK-NEXT: packh a5, a0, a0 -; RV32VB-PACK-NEXT: packh a0, a0, a0 -; RV32VB-PACK-NEXT: pack a0, a0, a2 -; RV32VB-PACK-NEXT: pack a1, a1, a5 +; RV32VB-PACK-NEXT: packh a2, a3, a4 +; RV32VB-PACK-NEXT: lbu a3, 75(a0) +; RV32VB-PACK-NEXT: lbu a4, 82(a0) +; RV32VB-PACK-NEXT: lbu a5, 93(a0) +; RV32VB-PACK-NEXT: lbu a6, 124(a0) +; RV32VB-PACK-NEXT: packh a4, a4, a5 +; RV32VB-PACK-NEXT: lbu a5, 144(a0) +; RV32VB-PACK-NEXT: lbu a0, 154(a0) +; RV32VB-PACK-NEXT: packh a0, a5, a0 +; RV32VB-PACK-NEXT: packh a3, a0, a3 +; RV32VB-PACK-NEXT: pack a2, a2, a3 +; RV32VB-PACK-NEXT: packh a3, a0, a0 +; RV32VB-PACK-NEXT: packh a5, a6, a0 +; RV32VB-PACK-NEXT: pack a0, a5, a0 +; RV32VB-PACK-NEXT: pack a1, a1, a3 ; RV32VB-PACK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32VB-PACK-NEXT: vmv.v.x v8, a1 -; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a3 -; RV32VB-PACK-NEXT: pack a1, a4, a5 +; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a2 +; RV32VB-PACK-NEXT: pack a1, a4, a3 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a1 ; RV32VB-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RV32VB-PACK-NEXT: ret @@ -2835,13 +2842,11 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64V-ONLY-NEXT: lbu a6, 82(a0) ; RV64V-ONLY-NEXT: lbu a7, 93(a0) ; RV64V-ONLY-NEXT: lbu t0, 124(a0) -; RV64V-ONLY-NEXT: li t1, 255 -; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64V-ONLY-NEXT: vmv.s.x v0, t1 ; RV64V-ONLY-NEXT: lbu t1, 144(a0) ; RV64V-ONLY-NEXT: lbu a0, 154(a0) -; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; RV64V-ONLY-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: li a1, 255 ; RV64V-ONLY-NEXT: vmv.v.x v9, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 @@ -2855,39 +2860,42 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v0, a1 +; RV64V-ONLY-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64V-ONLY-NEXT: ret ; ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 1(a0) -; RVA22U64-NEXT: lbu a2, 0(a0) +; RVA22U64-NEXT: lbu a1, 0(a0) +; RVA22U64-NEXT: lbu a2, 1(a0) ; RVA22U64-NEXT: lbu a3, 44(a0) ; RVA22U64-NEXT: lbu a4, 55(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a6, a2, a1 -; RVA22U64-NEXT: lbu a7, 75(a0) -; RVA22U64-NEXT: lbu a5, 82(a0) -; RVA22U64-NEXT: lbu a1, 93(a0) -; RVA22U64-NEXT: lbu a2, 124(a0) +; RVA22U64-NEXT: slli a2, a2, 8 ; RVA22U64-NEXT: slli a3, a3, 32 ; RVA22U64-NEXT: slli a4, a4, 40 +; RVA22U64-NEXT: or a6, a1, a2 ; RVA22U64-NEXT: or a3, 
a3, a4 -; RVA22U64-NEXT: lbu a4, 144(a0) +; RVA22U64-NEXT: lbu a2, 75(a0) +; RVA22U64-NEXT: lbu a4, 82(a0) +; RVA22U64-NEXT: lbu a5, 93(a0) +; RVA22U64-NEXT: lbu a1, 124(a0) +; RVA22U64-NEXT: slli a5, a5, 8 +; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: lbu a5, 144(a0) ; RVA22U64-NEXT: lbu a0, 154(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a5 -; RVA22U64-NEXT: slli a4, a4, 48 +; RVA22U64-NEXT: slli a5, a5, 48 ; RVA22U64-NEXT: slli a0, a0, 56 -; RVA22U64-NEXT: or a0, a0, a4 -; RVA22U64-NEXT: slli a7, a7, 56 -; RVA22U64-NEXT: or a3, a7, a3 -; RVA22U64-NEXT: slli a2, a2, 32 -; RVA22U64-NEXT: or a0, a0, a2 -; RVA22U64-NEXT: or a2, a6, a3 +; RVA22U64-NEXT: or a0, a0, a5 +; RVA22U64-NEXT: slli a2, a2, 56 +; RVA22U64-NEXT: or a2, a2, a3 +; RVA22U64-NEXT: slli a1, a1, 32 ; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: or a1, a6, a2 +; RVA22U64-NEXT: or a0, a0, a4 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a2 +; RVA22U64-NEXT: vmv.v.x v8, a1 ; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-NEXT: ret ; @@ -2895,29 +2903,29 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RVA22U64-PACK: # %bb.0: ; RVA22U64-PACK-NEXT: lbu a1, 0(a0) ; RVA22U64-PACK-NEXT: lbu a2, 1(a0) -; RVA22U64-PACK-NEXT: lbu a7, 44(a0) -; RVA22U64-PACK-NEXT: lbu t0, 55(a0) -; RVA22U64-PACK-NEXT: lbu a6, 75(a0) -; RVA22U64-PACK-NEXT: lbu a5, 82(a0) -; RVA22U64-PACK-NEXT: lbu a3, 93(a0) -; RVA22U64-PACK-NEXT: packh t1, a1, a2 -; RVA22U64-PACK-NEXT: lbu a2, 144(a0) -; RVA22U64-PACK-NEXT: lbu a4, 154(a0) -; RVA22U64-PACK-NEXT: packh a1, a7, t0 -; RVA22U64-PACK-NEXT: lbu a0, 124(a0) -; RVA22U64-PACK-NEXT: packh a3, a5, a3 -; RVA22U64-PACK-NEXT: packh a2, a2, a4 -; RVA22U64-PACK-NEXT: packh a4, a0, a6 -; RVA22U64-PACK-NEXT: packw a1, a1, a4 -; RVA22U64-PACK-NEXT: packh a4, a0, a0 -; RVA22U64-PACK-NEXT: packh a0, a0, a0 -; RVA22U64-PACK-NEXT: packw a5, t1, a4 -; RVA22U64-PACK-NEXT: packw a0, a0, a2 -; RVA22U64-PACK-NEXT: packw a2, a3, a4 -; RVA22U64-PACK-NEXT: pack a1, a5, a1 -; RVA22U64-PACK-NEXT: pack a0, a2, a0 +; RVA22U64-PACK-NEXT: lbu a3, 44(a0) +; RVA22U64-PACK-NEXT: lbu a4, 55(a0) +; RVA22U64-PACK-NEXT: packh a6, a1, a2 +; RVA22U64-PACK-NEXT: packh a2, a3, a4 +; RVA22U64-PACK-NEXT: lbu a3, 75(a0) +; RVA22U64-PACK-NEXT: lbu a4, 82(a0) +; RVA22U64-PACK-NEXT: lbu a5, 93(a0) +; RVA22U64-PACK-NEXT: lbu a1, 124(a0) +; RVA22U64-PACK-NEXT: packh a4, a4, a5 +; RVA22U64-PACK-NEXT: lbu a5, 144(a0) +; RVA22U64-PACK-NEXT: lbu a0, 154(a0) +; RVA22U64-PACK-NEXT: packh a0, a5, a0 +; RVA22U64-PACK-NEXT: packh a3, a0, a3 +; RVA22U64-PACK-NEXT: packw a2, a2, a3 +; RVA22U64-PACK-NEXT: packh a3, a0, a0 +; RVA22U64-PACK-NEXT: packh a1, a1, a0 +; RVA22U64-PACK-NEXT: packw a5, a6, a3 +; RVA22U64-PACK-NEXT: packw a0, a1, a0 +; RVA22U64-PACK-NEXT: packw a1, a4, a3 +; RVA22U64-PACK-NEXT: pack a2, a5, a2 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 ; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 +; RVA22U64-PACK-NEXT: vmv.v.x v8, a2 ; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a0 ; RVA22U64-PACK-NEXT: ret ; @@ -2931,13 +2939,11 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64ZVE32-NEXT: lbu a6, 82(a0) ; RV64ZVE32-NEXT: lbu a7, 93(a0) ; RV64ZVE32-NEXT: lbu t0, 124(a0) -; RV64ZVE32-NEXT: li t1, 255 -; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32-NEXT: vmv.s.x v0, t1 ; RV64ZVE32-NEXT: lbu t1, 144(a0) ; RV64ZVE32-NEXT: lbu a0, 154(a0) -; RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; 
RV64ZVE32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: li a1, 255 ; RV64ZVE32-NEXT: vmv.v.x v9, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 @@ -2951,6 +2957,9 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t1 ; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v0, a1 +; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 8, v0.t ; RV64ZVE32-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 @@ -3011,13 +3020,13 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV32-ONLY-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-ONLY-NEXT: vmv.v.x v8, a0 ; RV32-ONLY-NEXT: vmv.v.x v9, a4 -; RV32-ONLY-NEXT: vmv.v.i v0, 15 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a6 ; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a7 +; RV32-ONLY-NEXT: vmv.v.i v0, 15 ; RV32-ONLY-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV32-ONLY-NEXT: ret ; @@ -3064,13 +3073,13 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV64V-ONLY-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64V-ONLY-NEXT: vmv.v.x v8, a0 ; RV64V-ONLY-NEXT: vmv.v.x v9, a4 -; RV64V-ONLY-NEXT: vmv.v.i v0, 15 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a7 +; RV64V-ONLY-NEXT: vmv.v.i v0, 15 ; RV64V-ONLY-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64V-ONLY-NEXT: ret ; @@ -3119,13 +3128,13 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 % ; RV64ZVE32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64ZVE32-NEXT: vmv.v.x v8, a0 ; RV64ZVE32-NEXT: vmv.v.x v9, a4 -; RV64ZVE32-NEXT: vmv.v.i v0, 15 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a6 ; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a3 ; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a7 +; RV64ZVE32-NEXT: vmv.v.i v0, 15 ; RV64ZVE32-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32-NEXT: ret %v1 = insertelement <8 x i8> poison, i8 %e1, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index da7cdf3ba8ec0..f01ead3fea62f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -51,15 +51,20 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) { define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { ; V128-LABEL: interleave_v2i64: ; V128: # %bb.0: +; V128-NEXT: csrr a0, vlenb ; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; V128-NEXT: vmv1r.v v12, v9 -; V128-NEXT: vid.v v9 +; V128-NEXT: vid.v v10 +; V128-NEXT: srli a0, a0, 3 +; V128-NEXT: vsrl.vi v10, v10, 1 +; V128-NEXT: vslidedown.vx v11, v10, a0 +; V128-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; V128-NEXT: vrgatherei16.vv v13, v9, v11 +; V128-NEXT: vrgatherei16.vv v12, v9, v10 
+; V128-NEXT: vrgatherei16.vv v15, v8, v11 +; V128-NEXT: vrgatherei16.vv v14, v8, v10 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vsrl.vi v14, v9, 1 -; V128-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; V128-NEXT: vrgatherei16.vv v10, v8, v14 -; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t -; V128-NEXT: vmv.v.v v8, v10 +; V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; V128-NEXT: vmerge.vvm v8, v14, v12, v0 ; V128-NEXT: ret ; ; RV32-V512-LABEL: interleave_v2i64: @@ -67,9 +72,9 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { ; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; RV32-V512-NEXT: vid.v v10 ; RV32-V512-NEXT: vsrl.vi v11, v10, 1 -; RV32-V512-NEXT: vmv.v.i v0, 10 ; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 +; RV32-V512-NEXT: vmv.v.i v0, 10 ; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t ; RV32-V512-NEXT: vmv.v.v v8, v10 ; RV32-V512-NEXT: ret @@ -79,8 +84,8 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { ; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu ; RV64-V512-NEXT: vid.v v10 ; RV64-V512-NEXT: vsrl.vi v11, v10, 1 -; RV64-V512-NEXT: vmv.v.i v0, 10 ; RV64-V512-NEXT: vrgather.vv v10, v8, v11 +; RV64-V512-NEXT: vmv.v.i v0, 10 ; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t ; RV64-V512-NEXT: vmv.v.v v8, v10 ; RV64-V512-NEXT: ret @@ -416,8 +421,8 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-NEXT: vzext.vf2 v8, v24 ; V128-NEXT: addi a1, a1, -1366 ; V128-NEXT: vzext.vf2 v24, v0 -; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vsll.vx v8, v8, a0 +; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; V128-NEXT: vmerge.vvm v24, v24, v8, v0 ; V128-NEXT: addi a0, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a5039c58fccb1..af2ac99354db1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -86,8 +86,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) { ; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v11, (a0) -; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -155,20 +155,18 @@ define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) { define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) { ; RV32-LABEL: vrgather_shuffle_vv_v8i64: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vmv.v.i v16, 2 +; RV32-NEXT: li a0, 5 +; RV32-NEXT: vslide1down.vx v20, v16, a0 ; RV32-NEXT: lui a0, %hi(.LCPI11_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI11_0) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v20, (a0) -; RV32-NEXT: vmv.v.i v21, 2 +; RV32-NEXT: vle16.v v21, (a0) ; RV32-NEXT: li a0, 164 -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v20 -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: li a0, 5 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vslide1down.vx v8, v21, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v12, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v8, v21 +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vrgatherei16.vv v16, v12, v20, v0.t ; RV32-NEXT: vmv.v.v v8, v16 ; 
RV32-NEXT: ret ; @@ -211,8 +209,8 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { ; RV32-NEXT: addi a0, a0, %lo(.LCPI12_1) ; RV32-NEXT: vle16.v v21, (a0) ; RV32-NEXT: li a0, 113 -; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vrgatherei16.vv v12, v16, v20 +; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vrgatherei16.vv v12, v8, v21, v0.t ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret @@ -365,10 +363,10 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) { define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 66 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vrgather.vi v10, v8, 2 +; CHECK-NEXT: li a0, 66 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -385,9 +383,9 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma ; CHECK-NEXT: vmv.s.x v11, a0 ; CHECK-NEXT: li a0, 66 -; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -400,10 +398,10 @@ define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vrgather.vi v10, v8, 2 -; CHECK-NEXT: li a0, 67 -; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 4 +; CHECK-NEXT: li a0, 67 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 @@ -420,9 +418,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v11, a0 ; CHECK-NEXT: li a0, 66 -; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -434,16 +432,16 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0_ins_i2we4: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 4 +; CHECK-NEXT: vmv.v.i v11, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vrgather.vi v10, v8, 2 ; CHECK-NEXT: li a0, 70 ; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v11, v10, 2 +; CHECK-NEXT: vslideup.vi v12, v11, 2 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v10, v8, 2 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t +; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -453,13 +451,13 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI26_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: li a0, 20 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; 
CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI26_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0) ; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vrgather.vv v8, v9, v10 ; CHECK-NEXT: ret %shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -670,10 +668,10 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, -22 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: li a0, -22 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -684,13 +682,13 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: unmergable: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: li a0, 84 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI46_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) ; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: vrgather.vv v8, v9, v10 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> @@ -817,13 +815,17 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) { define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) { ; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index2: ; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vsrl.vi v10, v10, 1 -; CHECK-NEXT: vadd.vi v12, v10, -1 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsrl.vi v9, v9, 1 +; CHECK-NEXT: vadd.vi v9, v9, -1 +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -833,12 +835,16 @@ define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_spread3_singlesrc_e32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: vslide1down.vx v12, v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vslide1down.vx v9, v9, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -848,12 +854,16 @@ define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { define <8 x i32> @shuffle_spread4_singlesrc_e32(<8 x i32> %v) 
{ ; CHECK-LABEL: shuffle_spread4_singlesrc_e32: ; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vsrl.vi v12, v10, 2 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsrl.vi v9, v9, 2 +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -977,15 +987,19 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 7 -; CHECK-NEXT: vmv.v.i v11, 1 +; CHECK-NEXT: vmv.v.i v10, 1 ; CHECK-NEXT: li a0, 192 -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmerge.vim v11, v11, 0, v0 -; CHECK-NEXT: vmv.v.v v0, v10 -; CHECK-NEXT: vmerge.vim v12, v11, 2, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 +; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 2, v0 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -994,12 +1008,16 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) { define <8 x i32> @shuffle_repeat4_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_repeat4_singlesrc_e32: ; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vsrl.vi v12, v10, 2 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsrl.vi v9, v9, 2 +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -1060,18 +1078,18 @@ define <16 x i64> @shuffle_zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) { define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) { ; CHECK-LABEL: shuffle_disjoint_lanes: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI74_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI74_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0) ; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v18, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 +; 
CHECK-NEXT: vrgatherei16.vv v8, v12, v16 ; CHECK-NEXT: ret %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> ret <16 x i32> %out @@ -1098,12 +1116,12 @@ define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32 ; CHECK-NEXT: lui a0, %hi(.LCPI76_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0) ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vle16.v v20, (a0) +; CHECK-NEXT: vrgather.vi v16, v8, 7 +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, 15 ; CHECK-NEXT: addi a0, a0, 240 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vi v16, v8, 7 -; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t +; CHECK-NEXT: vrgatherei16.vv v16, v12, v8, v0.t ; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret %out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> @@ -1113,14 +1131,14 @@ define <16 x i32> @shuffle_disjoint_lanes_one_broadcast(<16 x i32> %v, <16 x i32 define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) { ; CHECK-LABEL: shuffle_disjoint_lanes_one_splat: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI77_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI77_0) ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vle16.v v16, (a1) -; CHECK-NEXT: lui a1, 15 -; CHECK-NEXT: addi a1, a1, 240 -; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI77_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI77_0) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: lui a0, 15 +; CHECK-NEXT: addi a0, a0, 240 +; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -1249,14 +1267,14 @@ define void @shuffle_i128_ldst(ptr %p) { define void @shuffle_i256_ldst(ptr %p) { ; CHECK-LABEL: shuffle_i256_ldst: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: lui a1, %hi(.LCPI80_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI80_0) -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v18, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v24, v16, v10 +; CHECK-NEXT: vrgatherei16.vv v24, v8, v18 ; CHECK-NEXT: vse64.v v24, (a0) ; CHECK-NEXT: ret %a = load <4 x i256>, ptr %p @@ -1291,12 +1309,24 @@ define void @shuffle_i128_splat(ptr %p) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: lui a2, 16 +; CHECK-NEXT: srli a1, a1, 3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vmv.v.x v9, a2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v10, v9, a1 +; CHECK-NEXT: vslidedown.vx v11, v10, a1 +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v13, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v9 +; CHECK-NEXT: vrgatherei16.vv v14, v8, v11 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v11, a1 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v15, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v12 -; CHECK-NEXT: vse64.v v16, (a0) +; CHECK-NEXT: vse64.v v12, (a0) ; CHECK-NEXT: ret %a = load <4 x i128>, ptr %p %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> @@ 
-1361,10 +1391,45 @@ define <16 x i32> @shuffle_m2_prefix(<16 x i32> %a) { ; CHECK-NEXT: lui a0, %hi(.LCPI85_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI85_0) ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v14, (a0) -; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgatherei16.vv v12, v8, v10 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> ret <16 x i32> %out } + +define <4 x i16> @vmerge_1(<4 x i16> %x) { +; CHECK-LABEL: vmerge_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 +; CHECK-NEXT: ret + %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> + ret <4 x i16> %s +} + +define <4 x i16> @vmerge_2(<4 x i16> %x) { +; CHECK-LABEL: vmerge_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 9 +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> + ret <4 x i16> %s +} + +define <4 x i16> @vmerge_3(<4 x i16> %x) { +; CHECK-LABEL: vmerge_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vrgather.vi v8, v9, 1, v0.t +; CHECK-NEXT: ret + %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> + ret <4 x i16> %s +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll index 32c1f2ca32fab..a5e730d47395d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll @@ -77,10 +77,10 @@ define void @gather_const_v2i64(ptr %x) { define void @gather_const_v64i8(ptr %x) { ; CHECK-LABEL: gather_const_v64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 32(a0) -; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: lbu a2, 32(a0) +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x @@ -94,10 +94,10 @@ define void @gather_const_v64i8(ptr %x) { define void @gather_const_v16i16(ptr %x) { ; CHECK-LABEL: gather_const_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 50(a0) -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: lh a2, 50(a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 392709fdb4cf7..e6514cfe7d473 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1046,46 +1046,45 @@ define void @mulhu_v16i8(ptr %x) { ; CHECK-LABEL: mulhu_v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: lui a1, 3 -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: lui a2, %hi(.LCPI65_0) -; CHECK-NEXT: addi a2, a2, %lo(.LCPI65_0) -; CHECK-NEXT: vle8.v v11, (a2) +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: li a2, 513 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: 
vmv.s.x v8, a2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: addi a1, a1, -2044 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: addi a1, a2, 32 -; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: li a1, -128 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vxm v12, v10, a1, v0 -; CHECK-NEXT: li a1, 513 -; CHECK-NEXT: vmv.v.i v13, 4 +; CHECK-NEXT: vmerge.vxm v12, v11, a1, v0 +; CHECK-NEXT: lui a1, %hi(.LCPI65_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI65_0) +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v11, 1, v0 +; CHECK-NEXT: vmv.v.i v11, 4 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vle8.v v11, (a1) ; CHECK-NEXT: addi a1, a2, 78 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v10, v13, 1, v0 +; CHECK-NEXT: vsrl.vv v9, v10, v9 +; CHECK-NEXT: vmulhu.vv v9, v9, v11 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vsrl.vv v8, v9, v8 -; CHECK-NEXT: vmulhu.vv v8, v8, v11 -; CHECK-NEXT: vmerge.vim v10, v10, 3, v0 +; CHECK-NEXT: vmerge.vim v8, v8, 3, v0 ; CHECK-NEXT: lui a1, 8 ; CHECK-NEXT: addi a1, a1, 304 -; CHECK-NEXT: vsub.vv v9, v9, v8 -; CHECK-NEXT: vmulhu.vv v9, v9, v12 +; CHECK-NEXT: vsub.vv v10, v10, v9 +; CHECK-NEXT: vmulhu.vv v10, v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vmerge.vim v9, v10, 2, v0 -; CHECK-NEXT: vsrl.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v9, v10, v9 +; CHECK-NEXT: vmerge.vim v8, v8, 2, v0 +; CHECK-NEXT: vsrl.vv v8, v9, v8 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x @@ -1108,20 +1107,20 @@ define void @mulhu_v8i16(ptr %x) { ; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.i v11, 3 -; CHECK-NEXT: vle16.v v12, (a1) -; CHECK-NEXT: vmerge.vim v11, v11, 2, v0 -; CHECK-NEXT: vmv1r.v v13, v9 +; CHECK-NEXT: vmv1r.v v12, v9 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; CHECK-NEXT: vslideup.vi v9, v10, 6 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmerge.vim v11, v11, 2, v0 +; CHECK-NEXT: vle16.v v13, (a1) ; CHECK-NEXT: vsrl.vv v9, v8, v9 -; CHECK-NEXT: vmulhu.vv v9, v9, v12 +; CHECK-NEXT: vmulhu.vv v9, v9, v13 ; CHECK-NEXT: lui a1, 1048568 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v13, a1 +; CHECK-NEXT: vmv.s.x v12, a1 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: vmulhu.vv v8, v8, v13 +; CHECK-NEXT: vmulhu.vv v8, v8, v12 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; CHECK-NEXT: vslideup.vi v11, v10, 6 @@ -1162,13 +1161,13 @@ define void @mulhu_v4i32(ptr %x) { ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: lui a1, %hi(.LCPI68_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0) -; CHECK-NEXT: vle32.v v11, (a1) ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma ; CHECK-NEXT: vslideup.vi v9, v10, 2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v10, (a1) ; CHECK-NEXT: lui a1, 4128 ; CHECK-NEXT: addi a1, a1, 514 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmulhu.vv v10, v8, v11 +; CHECK-NEXT: vmulhu.vv v10, 
v8, v10 ; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vmulhu.vv v8, v8, v9 ; CHECK-NEXT: vmv.s.x v9, a1 @@ -1206,8 +1205,6 @@ define void @mulhu_v2i64(ptr %x) { ; ; RV64-LABEL: mulhu_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 838861 ; RV64-NEXT: lui a2, 699051 ; RV64-NEXT: addiw a1, a1, -819 @@ -1216,6 +1213,8 @@ define void @mulhu_v2i64(ptr %x) { ; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: slli a3, a2, 32 ; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma ; RV64-NEXT: vmv.s.x v9, a2 @@ -1322,10 +1321,10 @@ define void @mulhs_v4i32(ptr %x) { ; ; RV64-LABEL: mulhs_v4i32: ; RV64: # %bb.0: -; RV64-NEXT: lui a1, %hi(.LCPI73_0) -; RV64-NEXT: ld a1, %lo(.LCPI73_0)(a1) ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: lui a1, %hi(.LCPI73_0) +; RV64-NEXT: ld a1, %lo(.LCPI73_0)(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -3153,27 +3152,27 @@ define void @mulhu_v32i8(ptr %x) { ; CHECK-LABEL: mulhu_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: lui a2, 163907 -; CHECK-NEXT: addi a2, a2, -2044 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v0, a2 -; CHECK-NEXT: lui a2, 66049 -; CHECK-NEXT: addi a2, a2, 32 -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: li a2, -128 -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: lui a2, %hi(.LCPI181_0) +; CHECK-NEXT: addi a2, a2, %lo(.LCPI181_0) +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle8.v v10, (a2) +; CHECK-NEXT: lui a1, 163907 +; CHECK-NEXT: addi a1, a1, -2044 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: lui a1, 66049 +; CHECK-NEXT: addi a1, a1, 32 +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: li a1, -128 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vmerge.vxm v10, v12, a2, v0 -; CHECK-NEXT: lui a1, %hi(.LCPI181_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI181_0) -; CHECK-NEXT: vle8.v v14, (a0) +; CHECK-NEXT: vmerge.vxm v14, v12, a1, v0 +; CHECK-NEXT: lui a1, 8208 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 -; CHECK-NEXT: vle8.v v12, (a1) -; CHECK-NEXT: lui a1, 8208 +; CHECK-NEXT: vle8.v v12, (a0) ; CHECK-NEXT: addi a1, a1, 513 -; CHECK-NEXT: vsrl.vv v8, v14, v8 -; CHECK-NEXT: vmulhu.vv v12, v8, v12 +; CHECK-NEXT: vsrl.vv v8, v12, v8 +; CHECK-NEXT: vmulhu.vv v10, v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, 66785 @@ -3181,8 +3180,8 @@ define void @mulhu_v32i8(ptr %x) { ; CHECK-NEXT: vmv.s.x v8, a1 ; CHECK-NEXT: lui a1, 529160 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vsub.vv v14, v14, v12 -; CHECK-NEXT: vmulhu.vv v10, v14, v10 +; CHECK-NEXT: vsub.vv v12, v12, v10 +; CHECK-NEXT: vmulhu.vv v12, v12, v14 ; CHECK-NEXT: vmv.v.i v14, 4 ; CHECK-NEXT: addi a1, a1, 304 ; CHECK-NEXT: vmerge.vim v14, v14, 1, v0 @@ -3191,7 +3190,7 @@ define void @mulhu_v32i8(ptr %x) { ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; CHECK-NEXT: vmerge.vim v14, v14, 3, v0 -; CHECK-NEXT: vadd.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v10, v12, v10 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v8, v14, 2, v0 ; CHECK-NEXT: vsrl.vv v8, v10, v8 @@ -3291,11 +3290,11 @@ define void @mulhu_v8i32(ptr %x) { 
; CHECK-NEXT: li a1, 68 ; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vmerge.vxm v10, v10, a1, v0 ; CHECK-NEXT: lui a1, %hi(.LCPI183_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI183_0) ; CHECK-NEXT: vle32.v v12, (a1) -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: vmerge.vxm v10, v10, a1, v0 ; CHECK-NEXT: lui a1, 4128 ; CHECK-NEXT: addi a1, a1, 514 ; CHECK-NEXT: vmulhu.vv v12, v8, v12 @@ -3450,10 +3449,10 @@ define void @mulhs_v8i32(ptr %x) { ; ; RV64-LABEL: mulhs_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: lui a1, %hi(.LCPI187_0) -; RV64-NEXT: ld a1, %lo(.LCPI187_0)(a1) ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: lui a1, %hi(.LCPI187_0) +; RV64-NEXT: ld a1, %lo(.LCPI187_0)(a1) ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma @@ -3507,6 +3506,8 @@ define void @mulhs_v4i64(ptr %x) { ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a0) ; RV64-NEXT: lui a1, 349525 +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV64-NEXT: vmv.v.i v0, 5 ; RV64-NEXT: lui a2, 1044496 ; RV64-NEXT: addiw a1, a1, 1365 ; RV64-NEXT: addi a2, a2, -256 @@ -3514,12 +3515,10 @@ define void @mulhs_v4i64(ptr %x) { ; RV64-NEXT: slli a2, a1, 32 ; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: lui a2, %hi(.LCPI188_0) -; RV64-NEXT: ld a2, %lo(.LCPI188_0)(a2) -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v0, 5 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vmv.v.x v12, a1 ; RV64-NEXT: li a1, 63 +; RV64-NEXT: ld a2, %lo(.LCPI188_0)(a2) ; RV64-NEXT: vmerge.vxm v12, v12, a2, v0 ; RV64-NEXT: lui a2, 4096 ; RV64-NEXT: addi a2, a2, 256 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll index b65352aed2d52..211c434c65743 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll @@ -7,55 +7,53 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-LABEL: load_large_vector: ; ZVE32X: # %bb.0: -; ZVE32X-NEXT: ld a1, 0(a0) -; ZVE32X-NEXT: ld a2, 8(a0) -; ZVE32X-NEXT: ld a3, 24(a0) -; ZVE32X-NEXT: ld a4, 32(a0) -; ZVE32X-NEXT: ld a5, 48(a0) -; ZVE32X-NEXT: ld a6, 56(a0) -; ZVE32X-NEXT: ld a7, 72(a0) -; ZVE32X-NEXT: ld a0, 80(a0) +; ZVE32X-NEXT: ld a1, 48(a0) +; ZVE32X-NEXT: ld a2, 56(a0) +; ZVE32X-NEXT: ld a3, 72(a0) +; ZVE32X-NEXT: ld a4, 80(a0) +; ZVE32X-NEXT: ld a5, 0(a0) +; ZVE32X-NEXT: ld a6, 8(a0) +; ZVE32X-NEXT: ld a7, 24(a0) +; ZVE32X-NEXT: ld a0, 32(a0) ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmv.s.x v8, zero ; ZVE32X-NEXT: vmv.v.i v9, 0 -; ZVE32X-NEXT: xor a3, a3, a4 -; ZVE32X-NEXT: xor a1, a1, a2 -; ZVE32X-NEXT: xor a2, a5, a6 ; ZVE32X-NEXT: xor a0, a7, a0 -; ZVE32X-NEXT: snez a3, a3 +; ZVE32X-NEXT: xor a5, a5, a6 +; ZVE32X-NEXT: xor a1, a1, a2 +; ZVE32X-NEXT: xor a3, a3, a4 +; ZVE32X-NEXT: snez a0, a0 +; ZVE32X-NEXT: snez a2, a5 ; ZVE32X-NEXT: snez a1, a1 -; ZVE32X-NEXT: vmv.s.x v10, a3 -; ZVE32X-NEXT: vmv.s.x v11, a1 +; ZVE32X-NEXT: snez a3, a3 +; ZVE32X-NEXT: vmv.s.x v10, a0 +; ZVE32X-NEXT: vmv.s.x v11, a2 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v10, v10, 1 +; ZVE32X-NEXT: vand.vi v11, v11, 1 ; ZVE32X-NEXT: vmsne.vi v0, v10, 0 -; ZVE32X-NEXT: vand.vi v10, v11, 1 -; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0 -; ZVE32X-NEXT: vmsne.vi v0, v10, 0 -; 
ZVE32X-NEXT: snez a1, a2 +; ZVE32X-NEXT: vmerge.vim v10, v8, 1, v0 +; ZVE32X-NEXT: vmsne.vi v0, v11, 0 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32X-NEXT: vmerge.vim v10, v9, 1, v0 +; ZVE32X-NEXT: vmerge.vim v11, v9, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma -; ZVE32X-NEXT: vslideup.vi v10, v11, 1 -; ZVE32X-NEXT: vmv.s.x v11, a1 -; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32X-NEXT: vmsne.vi v0, v10, 0 +; ZVE32X-NEXT: vslideup.vi v11, v10, 1 +; ZVE32X-NEXT: vmv.s.x v10, a1 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; ZVE32X-NEXT: vand.vi v10, v11, 1 +; ZVE32X-NEXT: vand.vi v10, v10, 1 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v11, 0 ; ZVE32X-NEXT: vmerge.vim v11, v9, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v10, 0 -; ZVE32X-NEXT: snez a0, a0 ; ZVE32X-NEXT: vmerge.vim v10, v8, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma ; ZVE32X-NEXT: vslideup.vi v11, v10, 2 -; ZVE32X-NEXT: vmv.s.x v10, a0 -; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32X-NEXT: vmsne.vi v0, v11, 0 +; ZVE32X-NEXT: vmv.s.x v10, a3 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v10, v10, 1 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v11, 0 ; ZVE32X-NEXT: vmerge.vim v9, v9, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v10, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 1516c67bf7ecc..e1f834b263782 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -183,10 +183,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 88 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: addi a4, a1, 128 ; RV32-NEXT: li a2, 32 @@ -194,79 +194,127 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a6, %hi(.LCPI8_0) ; RV32-NEXT: addi a6, a6, %lo(.LCPI8_0) ; RV32-NEXT: li a7, 768 -; RV32-NEXT: lui t0, 49164 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: lui a1, 49164 +; RV32-NEXT: vle32.v v24, (a4) +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li t0, 60 +; RV32-NEXT: mul a4, a4, t0 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, %hi(.LCPI8_1) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_1) +; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v8, (a6) +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li t0, 76 +; RV32-NEXT: mul a6, a6, t0 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs2r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v8, a7 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 36 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: 
addi a6, a6, 16 +; RV32-NEXT: vs1r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, a1, 12 +; RV32-NEXT: vle16.v v8, (a4) +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a6, 28 +; RV32-NEXT: mul a4, a4, a6 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vmv.s.x v8, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li t1, 72 -; RV32-NEXT: mul a1, a1, t1 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vle32.v v8, (a4) +; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv8r.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a4, 68 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a5, a5, 3 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v6, (a6) -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: lui a1, %hi(.LCPI8_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 76 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v16, v6 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v0, v16, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 44 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a3) -; RV32-NEXT: addi t0, t0, 12 -; RV32-NEXT: vmv.s.x v0, a7 -; RV32-NEXT: vmv.s.x v7, t0 -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v4, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v20, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 20 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; 
RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v24, v16, v8, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v24, v4 +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -278,23 +326,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: slli a1, a1, 10 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vle16.v v14, (a4) ; RV32-NEXT: vmv.s.x v12, a3 +; RV32-NEXT: vle16.v v14, (a4) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 @@ -303,7 +350,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 68 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -323,326 +377,312 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a5, 768 ; RV32-NEXT: li a6, 48 ; RV32-NEXT: lui a7, 3073 -; RV32-NEXT: li t0, 192 ; RV32-NEXT: addi a1, a1, 3 ; RV32-NEXT: addi a3, a3, 192 ; RV32-NEXT: addi a4, a4, 12 ; RV32-NEXT: addi a5, a5, 768 ; RV32-NEXT: addi a7, a7, -1024 -; RV32-NEXT: vmv.s.x v13, a6 -; RV32-NEXT: vmv.s.x v2, t0 +; RV32-NEXT: vmv.s.x v2, a6 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vmv.s.x v12, a3 -; RV32-NEXT: vmv.s.x v3, a4 -; RV32-NEXT: vmv.s.x v14, a5 -; RV32-NEXT: vmv.s.x v1, a7 +; RV32-NEXT: vmv.s.x v8, a3 +; RV32-NEXT: vmv.s.x v20, a4 +; RV32-NEXT: vmv.s.x v1, a5 +; RV32-NEXT: vmv.s.x v3, a7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a1, a1, a3 ; 
RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v20, v8, v16, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vmerge.vvm v4, v16, v24, v0 +; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vmv1r.v v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 -; RV32-NEXT: vmv1r.v v0, v14 +; RV32-NEXT: vmerge.vvm v20, v16, v24, v0 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 +; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v13 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v12, v8, v24, v0 +; RV32-NEXT: vmerge.vvm v12, v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 20 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: 
mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 192 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 60 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 32 ; RV32-NEXT: addi a1, a1, 4 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 36 +; RV32-NEXT: li a2, 20 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v12 +; RV32-NEXT: vrgatherei16.vv v8, v12, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 48 -; RV32-NEXT: lui a2, %hi(.LCPI8_3) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_3) ; RV32-NEXT: addi a1, a1, 5 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v25, a1 +; RV32-NEXT: vmv.v.x v3, a1 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v25 +; RV32-NEXT: vrgatherei16.vv v8, v12, v3 ; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 36 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 36 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_3) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_3) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v24 +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vrgatherei16.vv v12, v4, v24 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 28 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v12, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 52 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI8_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: lui a2, %hi(.LCPI8_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v26, (a1) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v24, (a2) -; RV32-NEXT: lui a1, %hi(.LCPI8_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle16.v v2, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 12 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v26 +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v0, v12 +; RV32-NEXT: lui a1, %hi(.LCPI8_5) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_5) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v20, v4, v24 +; RV32-NEXT: vle16.v v28, (a1) +; RV32-NEXT: vrgatherei16.vv v8, v20, v28 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a2, 28 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v8, v2 +; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 12 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size 
Folded Spill +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v0, v16, v24 ; RV32-NEXT: lui a1, %hi(.LCPI8_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) -; RV32-NEXT: lui a2, %hi(.LCPI8_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v16, (a2) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle16.v v18, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 20 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 +; RV32-NEXT: vle16.v v20, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v20 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v24, v0 +; RV32-NEXT: lui a1, %hi(.LCPI8_8) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_8) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 60 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v4, v18 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v28, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v0 +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 192 -; RV32-NEXT: vse32.v v20, (a1) -; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 52 +; 
RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 88 +; RV32-NEXT: li a1, 84 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 @@ -659,463 +699,419 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 ; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb -; RV64-NEXT: addi a3, a1, 128 -; RV64-NEXT: addi a6, a1, 256 -; RV64-NEXT: li a4, 128 -; RV64-NEXT: lui a2, 1 -; RV64-NEXT: lui a5, %hi(.LCPI8_0) -; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0) +; RV64-NEXT: addi a3, a1, 256 +; RV64-NEXT: li a2, 128 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.i v16, 6 +; RV64-NEXT: lui a4, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (a6) -; RV64-NEXT: lui a6, 16 -; RV64-NEXT: addi a6, a6, 7 +; RV64-NEXT: vle64.v v8, (a3) +; RV64-NEXT: addi a4, a4, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v17, a6 -; RV64-NEXT: addi a6, a2, 65 +; RV64-NEXT: vmv.v.x v17, a4 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v4, v8, 4 +; RV64-NEXT: vrgather.vi v24, v8, 4 ; RV64-NEXT: vrgather.vi v20, v8, 5 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 84 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 76 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs4r.v v20, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v20, v8, v16 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: slli a7, a7, 6 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 84 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs4r.v v20, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v20, v8, v17 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 56 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 80 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs4r.v v20, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v16, v8, 2 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 72 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 72 +; 
RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs4r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v16, v8, 3 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 68 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 6 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs4r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 8 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 40 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v0, a4 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 5 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 48 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v7, a2 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs1r.v v7, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vrgather.vi v24, v8, 2, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 68 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, a1, 128 +; RV64-NEXT: lui a2, 1 +; RV64-NEXT: lui a4, %hi(.LCPI8_0) +; RV64-NEXT: addi a4, a4, %lo(.LCPI8_0) +; RV64-NEXT: addi a5, a2, 65 +; RV64-NEXT: vmv.s.x v0, a5 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle16.v v2, (a4) +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a4, 56 +; RV64-NEXT: mul a1, a1, a4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vle64.v v16, (a3) +; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 40 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vle16.v v12, (a5) +; RV64-NEXT: vrgatherei16.vv v8, v24, v2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v2, a6 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v4, v8, 2, v0.t +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 60 +; RV64-NEXT: li a3, 76 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v2 -; RV64-NEXT: vmv8r.v v8, v24 +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 48 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, 
e64, m8, ta, ma -; RV64-NEXT: vmerge.vvm v24, v16, v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v0, v24, v16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vrgather.vi v24, v8, 3, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 24 +; RV64-NEXT: li a3, 76 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 2 ; RV64-NEXT: lui a3, %hi(.LCPI8_1) ; RV64-NEXT: addi a3, a3, %lo(.LCPI8_1) ; RV64-NEXT: addi a1, a1, 130 -; RV64-NEXT: vle16.v v16, (a3) -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 -; RV64-NEXT: add a3, sp, a3 -; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs2r.v v16, (a3) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v2, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 84 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v24, v16, 3, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle16.v v8, (a3) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 84 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 56 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v0, v24, v8 +; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v24, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 4 -; RV64-NEXT: lui a3, 8 ; RV64-NEXT: addi a1, a1, 260 -; RV64-NEXT: addi a3, a3, 520 ; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vmv.s.x v2, a3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 +; RV64-NEXT: li a3, 40 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 -; RV64-NEXT: addi 
a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl1r.v v7, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a3, 84 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 +; RV64-NEXT: li a3, 48 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv4r.v v8, v16 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 4, v0.t +; RV64-NEXT: vrgather.vi v24, v16, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a3, 84 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, 8 +; RV64-NEXT: addi a1, a1, 520 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 +; RV64-NEXT: li a3, 40 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v7 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 56 +; RV64-NEXT: li a3, 80 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a3, 48 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v24, v8, 5, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 56 +; RV64-NEXT: li a3, 80 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a3, 192 -; RV64-NEXT: vmv.s.x v3, a3 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v24, a1 -; RV64-NEXT: vmv1r.v v0, v3 +; RV64-NEXT: vmv.v.x v2, a1 +; RV64-NEXT: vmv.s.x v3, a3 +; RV64-NEXT: vmv.v.v v0, v3 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 72 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v28, v8, v24, v0.t -; 
RV64-NEXT: vmv4r.v v16, v8 +; RV64-NEXT: vrgatherei16.vv v24, v8, v2, v0.t ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a3, 72 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_2) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2) -; RV64-NEXT: li a3, 1040 -; RV64-NEXT: lui a4, 112 -; RV64-NEXT: addi a4, a4, 1 -; RV64-NEXT: vmv.s.x v0, a3 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a1, 1040 +; RV64-NEXT: lui a3, 112 +; RV64-NEXT: addi a3, a3, 1 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v5, a4 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle16.v v6, (a1) +; RV64-NEXT: vmv.v.x v12, a3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 +; RV64-NEXT: li a3, 56 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v24, v8, v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vmerge.vvm v24, v16, v24, v0 +; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 68 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: slli a1, a1, 6 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v28, v16, v5, v0.t +; RV64-NEXT: vrgatherei16.vv v24, v8, v12, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 68 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: slli a1, a1, 6 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill -; RV64-NEXT: addi a1, a2, -2016 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI8_2) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2) ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v16, v24, v6 +; RV64-NEXT: vle16.v v24, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 40 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v8, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 48 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: li a3, 48 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, a2, -2016 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, 
(a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) -; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 +; RV64-NEXT: li a2, 68 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 24 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a1, a1, 5 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v0 +; RV64-NEXT: vmv.v.v v16, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 +; RV64-NEXT: li a2, 68 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 84 +; RV64-NEXT: li a2, 76 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: li a2, 24 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v20, v8 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 84 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 40 +; RV64-NEXT: li a2, 48 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v24 +; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: li a2, 84 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI8_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v0, v24, v8 -; RV64-NEXT: lui a1, %hi(.LCPI8_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV64-NEXT: vle16.v v8, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI8_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) -; RV64-NEXT: vle16.v v10, (a1) +; RV64-NEXT: vle16.v v16, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 
6 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v10, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v8, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 +; RV64-NEXT: li a2, 80 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v8, v24 +; RV64-NEXT: lui a1, %hi(.LCPI8_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v8 +; RV64-NEXT: vle16.v v16, (a1) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v0, v16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 72 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v8, v24 +; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: lui a1, %hi(.LCPI8_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle16.v v16, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: li a2, 56 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v0, v16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 6 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v20, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v20 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 68 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v28, v24 +; RV64-NEXT: vmv.v.v v16, v24 ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 320 -; RV64-NEXT: vse64.v v28, (a1) +; RV64-NEXT: vse64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 192 -; RV64-NEXT: vse64.v v12, (a1) +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vse64.v v16, (a1) -; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 84 ; RV64-NEXT: mul a2, a2, a3 @@ -1123,8 +1119,10 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: addi a1, a0, 64 +; RV64-NEXT: vse64.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 +; RV64-NEXT: li a2, 68 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll index f27614c93985f..118408d40c669 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll @@ -612,50 +612,51 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) { ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: vslidedown.vi v12, v8, 3 +; RV64-NEXT: vslidedown.vi v13, v8, 2 +; RV64-NEXT: vslidedown.vi v14, v8, 1 +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v16, v8, 7 +; RV64-NEXT: vslidedown.vi v18, v8, 6 +; RV64-NEXT: vslidedown.vi v20, v8, 5 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: flw fa5, 124(sp) -; RV64-NEXT: vfmv.f.s fa4, v8 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vslidedown.vi v11, v8, 2 +; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 4 ; RV64-NEXT: fcvt.l.s a0, fa5 -; RV64-NEXT: sd a0, 248(sp) -; RV64-NEXT: flw fa5, 120(sp) -; RV64-NEXT: vslidedown.vi v12, v8, 1 -; RV64-NEXT: fcvt.l.s a0, fa4 -; RV64-NEXT: vfmv.f.s fa4, v10 +; RV64-NEXT: vfmv.f.s fa5, v12 ; RV64-NEXT: fcvt.l.s a1, fa5 -; RV64-NEXT: sd a1, 240(sp) -; RV64-NEXT: flw fa5, 116(sp) -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v8, 7 -; RV64-NEXT: fcvt.l.s a1, fa4 -; RV64-NEXT: vfmv.f.s fa4, v11 +; RV64-NEXT: vfmv.f.s fa5, v13 ; RV64-NEXT: fcvt.l.s a2, fa5 -; RV64-NEXT: sd a2, 232(sp) -; RV64-NEXT: flw fa5, 112(sp) -; RV64-NEXT: fcvt.l.s a2, fa4 -; RV64-NEXT: vfmv.f.s fa4, v12 -; RV64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-NEXT: vfmv.f.s fa5, v14 ; RV64-NEXT: fcvt.l.s a3, fa5 -; RV64-NEXT: sd a3, 224(sp) -; RV64-NEXT: flw fa5, 108(sp) -; RV64-NEXT: fcvt.l.s a3, fa4 -; RV64-NEXT: vfmv.f.s fa4, v14 -; RV64-NEXT: vslidedown.vi v12, v8, 5 +; RV64-NEXT: vfmv.f.s fa5, v16 ; RV64-NEXT: fcvt.l.s a4, fa5 -; RV64-NEXT: sd a4, 216(sp) -; RV64-NEXT: flw fa5, 104(sp) -; RV64-NEXT: fcvt.l.s a4, fa4 -; RV64-NEXT: vfmv.f.s fa4, v10 -; RV64-NEXT: fcvt.l.s a5, fa4 +; RV64-NEXT: vfmv.f.s fa5, v18 +; RV64-NEXT: fcvt.l.s a5, fa5 +; RV64-NEXT: vfmv.f.s fa5, v20 ; RV64-NEXT: fcvt.l.s a6, fa5 -; RV64-NEXT: sd a6, 208(sp) +; RV64-NEXT: flw fa5, 124(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: sd a7, 248(sp) +; RV64-NEXT: flw fa5, 120(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: sd a7, 240(sp) +; RV64-NEXT: flw fa5, 116(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: sd a7, 232(sp) +; RV64-NEXT: flw fa5, 112(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: sd a7, 224(sp) +; RV64-NEXT: flw fa5, 108(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: sd a7, 216(sp) +; RV64-NEXT: flw fa5, 104(sp) +; RV64-NEXT: fcvt.l.s a7, fa5 +; RV64-NEXT: sd a7, 208(sp) ; RV64-NEXT: flw fa5, 100(sp) -; RV64-NEXT: vfmv.f.s fa4, v12 -; RV64-NEXT: fcvt.l.s a6, fa4 -; RV64-NEXT: vslidedown.vi v8, v8, 4 ; RV64-NEXT: fcvt.l.s a7, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 ; RV64-NEXT: sd a7, 200(sp) @@ -981,26 +982,27 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) { ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: mv a0, sp +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vfmv.f.s fa5, v8 +; RV64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma +; RV64-NEXT: 
vslidedown.vi v14, v8, 3 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: fld fa5, 56(sp) -; RV64-NEXT: vfmv.f.s fa4, v8 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: fcvt.l.d a0, fa4 -; RV64-NEXT: fcvt.l.d a1, fa5 -; RV64-NEXT: sd a1, 120(sp) -; RV64-NEXT: fld fa5, 48(sp) -; RV64-NEXT: vfmv.f.s fa4, v10 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: fcvt.l.d a1, fa4 +; RV64-NEXT: vslidedown.vi v8, v8, 2 +; RV64-NEXT: fcvt.l.d a0, fa5 +; RV64-NEXT: vfmv.f.s fa5, v12 +; RV64-NEXT: fcvt.l.d a1, fa5 +; RV64-NEXT: vfmv.f.s fa5, v14 ; RV64-NEXT: fcvt.l.d a2, fa5 -; RV64-NEXT: sd a2, 112(sp) +; RV64-NEXT: fld fa5, 56(sp) +; RV64-NEXT: fcvt.l.d a3, fa5 +; RV64-NEXT: sd a3, 120(sp) +; RV64-NEXT: fld fa5, 48(sp) +; RV64-NEXT: fcvt.l.d a3, fa5 +; RV64-NEXT: sd a3, 112(sp) ; RV64-NEXT: fld fa5, 40(sp) -; RV64-NEXT: vfmv.f.s fa4, v10 -; RV64-NEXT: fcvt.l.d a2, fa4 -; RV64-NEXT: vslidedown.vi v8, v8, 2 ; RV64-NEXT: fcvt.l.d a3, fa5 ; RV64-NEXT: vfmv.f.s fa5, v8 ; RV64-NEXT: sd a3, 104(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll index 2f58e3dd2769f..23ecc74880c6a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll @@ -363,50 +363,51 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) { ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: mv a0, sp +; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vfmv.f.s fa5, v8 +; RV32-NEXT: vslidedown.vi v12, v8, 3 +; RV32-NEXT: vslidedown.vi v13, v8, 2 +; RV32-NEXT: vslidedown.vi v14, v8, 1 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v16, v8, 7 +; RV32-NEXT: vslidedown.vi v18, v8, 6 +; RV32-NEXT: vslidedown.vi v20, v8, 5 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: flw fa5, 60(sp) -; RV32-NEXT: vfmv.f.s fa4, v8 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vslidedown.vi v11, v8, 2 +; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 4 ; RV32-NEXT: fcvt.w.s a0, fa5 -; RV32-NEXT: sw a0, 124(sp) -; RV32-NEXT: flw fa5, 56(sp) -; RV32-NEXT: fcvt.w.s a0, fa4 -; RV32-NEXT: vfmv.f.s fa4, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vfmv.f.s fa5, v12 ; RV32-NEXT: fcvt.w.s a1, fa5 -; RV32-NEXT: sw a1, 120(sp) -; RV32-NEXT: flw fa5, 52(sp) -; RV32-NEXT: fcvt.w.s a1, fa4 -; RV32-NEXT: vfmv.f.s fa4, v11 -; RV32-NEXT: fcvt.w.s a2, fa4 +; RV32-NEXT: vfmv.f.s fa5, v13 +; RV32-NEXT: fcvt.w.s a2, fa5 +; RV32-NEXT: vfmv.f.s fa5, v14 ; RV32-NEXT: fcvt.w.s a3, fa5 -; RV32-NEXT: sw a3, 116(sp) -; RV32-NEXT: flw fa5, 48(sp) -; RV32-NEXT: vfmv.f.s fa4, v10 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 7 -; RV32-NEXT: fcvt.w.s a3, fa4 +; RV32-NEXT: vfmv.f.s fa5, v16 ; RV32-NEXT: fcvt.w.s a4, fa5 -; RV32-NEXT: sw a4, 112(sp) -; RV32-NEXT: flw fa5, 44(sp) -; RV32-NEXT: vfmv.f.s fa4, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 6 -; RV32-NEXT: fcvt.w.s a4, fa4 +; RV32-NEXT: vfmv.f.s fa5, v18 ; RV32-NEXT: fcvt.w.s a5, fa5 -; RV32-NEXT: sw a5, 108(sp) -; RV32-NEXT: flw fa5, 40(sp) -; RV32-NEXT: vfmv.f.s fa4, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 5 -; RV32-NEXT: fcvt.w.s a5, fa4 +; RV32-NEXT: vfmv.f.s fa5, v20 ; RV32-NEXT: fcvt.w.s a6, fa5 -; RV32-NEXT: sw a6, 
104(sp) +; RV32-NEXT: flw fa5, 60(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: sw a7, 124(sp) +; RV32-NEXT: flw fa5, 56(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: sw a7, 120(sp) +; RV32-NEXT: flw fa5, 52(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: sw a7, 116(sp) +; RV32-NEXT: flw fa5, 48(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: sw a7, 112(sp) +; RV32-NEXT: flw fa5, 44(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: sw a7, 108(sp) +; RV32-NEXT: flw fa5, 40(sp) +; RV32-NEXT: fcvt.w.s a7, fa5 +; RV32-NEXT: sw a7, 104(sp) ; RV32-NEXT: flw fa5, 36(sp) -; RV32-NEXT: vfmv.f.s fa4, v10 -; RV32-NEXT: fcvt.w.s a6, fa4 -; RV32-NEXT: vslidedown.vi v8, v8, 4 ; RV32-NEXT: fcvt.w.s a7, fa5 ; RV32-NEXT: vfmv.f.s fa5, v8 ; RV32-NEXT: sw a7, 100(sp) @@ -447,50 +448,51 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) { ; RV64-i32-NEXT: .cfi_def_cfa s0, 0 ; RV64-i32-NEXT: andi sp, sp, -64 ; RV64-i32-NEXT: mv a0, sp +; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-i32-NEXT: vfmv.f.s fa5, v8 +; RV64-i32-NEXT: vslidedown.vi v12, v8, 3 +; RV64-i32-NEXT: vslidedown.vi v13, v8, 2 +; RV64-i32-NEXT: vslidedown.vi v14, v8, 1 +; RV64-i32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-i32-NEXT: vslidedown.vi v16, v8, 7 +; RV64-i32-NEXT: vslidedown.vi v18, v8, 6 +; RV64-i32-NEXT: vslidedown.vi v20, v8, 5 ; RV64-i32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-i32-NEXT: vse32.v v8, (a0) -; RV64-i32-NEXT: flw fa5, 60(sp) -; RV64-i32-NEXT: vfmv.f.s fa4, v8 -; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i32-NEXT: vslidedown.vi v10, v8, 3 -; RV64-i32-NEXT: vslidedown.vi v11, v8, 2 +; RV64-i32-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-i32-NEXT: vslidedown.vi v8, v8, 4 ; RV64-i32-NEXT: fcvt.l.s a0, fa5 -; RV64-i32-NEXT: sw a0, 124(sp) -; RV64-i32-NEXT: flw fa5, 56(sp) -; RV64-i32-NEXT: fcvt.l.s a0, fa4 -; RV64-i32-NEXT: vfmv.f.s fa4, v10 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 1 +; RV64-i32-NEXT: vfmv.f.s fa5, v12 ; RV64-i32-NEXT: fcvt.l.s a1, fa5 -; RV64-i32-NEXT: sw a1, 120(sp) -; RV64-i32-NEXT: flw fa5, 52(sp) -; RV64-i32-NEXT: fcvt.l.s a1, fa4 -; RV64-i32-NEXT: vfmv.f.s fa4, v11 -; RV64-i32-NEXT: fcvt.l.s a2, fa4 +; RV64-i32-NEXT: vfmv.f.s fa5, v13 +; RV64-i32-NEXT: fcvt.l.s a2, fa5 +; RV64-i32-NEXT: vfmv.f.s fa5, v14 ; RV64-i32-NEXT: fcvt.l.s a3, fa5 -; RV64-i32-NEXT: sw a3, 116(sp) -; RV64-i32-NEXT: flw fa5, 48(sp) -; RV64-i32-NEXT: vfmv.f.s fa4, v10 -; RV64-i32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-i32-NEXT: vslidedown.vi v10, v8, 7 -; RV64-i32-NEXT: fcvt.l.s a3, fa4 +; RV64-i32-NEXT: vfmv.f.s fa5, v16 ; RV64-i32-NEXT: fcvt.l.s a4, fa5 -; RV64-i32-NEXT: sw a4, 112(sp) -; RV64-i32-NEXT: flw fa5, 44(sp) -; RV64-i32-NEXT: vfmv.f.s fa4, v10 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 6 -; RV64-i32-NEXT: fcvt.l.s a4, fa4 +; RV64-i32-NEXT: vfmv.f.s fa5, v18 ; RV64-i32-NEXT: fcvt.l.s a5, fa5 -; RV64-i32-NEXT: sw a5, 108(sp) -; RV64-i32-NEXT: flw fa5, 40(sp) -; RV64-i32-NEXT: vfmv.f.s fa4, v10 -; RV64-i32-NEXT: vslidedown.vi v10, v8, 5 -; RV64-i32-NEXT: fcvt.l.s a5, fa4 +; RV64-i32-NEXT: vfmv.f.s fa5, v20 ; RV64-i32-NEXT: fcvt.l.s a6, fa5 -; RV64-i32-NEXT: sw a6, 104(sp) +; RV64-i32-NEXT: flw fa5, 60(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: sw a7, 124(sp) +; RV64-i32-NEXT: flw fa5, 56(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: sw a7, 120(sp) +; RV64-i32-NEXT: flw fa5, 52(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: sw a7, 116(sp) +; RV64-i32-NEXT: flw fa5, 48(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: sw 
a7, 112(sp) +; RV64-i32-NEXT: flw fa5, 44(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: sw a7, 108(sp) +; RV64-i32-NEXT: flw fa5, 40(sp) +; RV64-i32-NEXT: fcvt.l.s a7, fa5 +; RV64-i32-NEXT: sw a7, 104(sp) ; RV64-i32-NEXT: flw fa5, 36(sp) -; RV64-i32-NEXT: vfmv.f.s fa4, v10 -; RV64-i32-NEXT: fcvt.l.s a6, fa4 -; RV64-i32-NEXT: vslidedown.vi v8, v8, 4 ; RV64-i32-NEXT: fcvt.l.s a7, fa5 ; RV64-i32-NEXT: vfmv.f.s fa5, v8 ; RV64-i32-NEXT: sw a7, 100(sp) @@ -531,50 +533,51 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) { ; RV64-i64-NEXT: .cfi_def_cfa s0, 0 ; RV64-i64-NEXT: andi sp, sp, -128 ; RV64-i64-NEXT: addi a0, sp, 64 +; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: vslidedown.vi v12, v8, 3 +; RV64-i64-NEXT: vslidedown.vi v13, v8, 2 +; RV64-i64-NEXT: vslidedown.vi v14, v8, 1 +; RV64-i64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-i64-NEXT: vslidedown.vi v16, v8, 7 +; RV64-i64-NEXT: vslidedown.vi v18, v8, 6 +; RV64-i64-NEXT: vslidedown.vi v20, v8, 5 ; RV64-i64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-i64-NEXT: vse32.v v8, (a0) -; RV64-i64-NEXT: flw fa5, 124(sp) -; RV64-i64-NEXT: vfmv.f.s fa4, v8 -; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-i64-NEXT: vslidedown.vi v11, v8, 2 +; RV64-i64-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64-i64-NEXT: vslidedown.vi v8, v8, 4 ; RV64-i64-NEXT: fcvt.l.s a0, fa5 -; RV64-i64-NEXT: sd a0, 248(sp) -; RV64-i64-NEXT: flw fa5, 120(sp) -; RV64-i64-NEXT: vslidedown.vi v12, v8, 1 -; RV64-i64-NEXT: fcvt.l.s a0, fa4 -; RV64-i64-NEXT: vfmv.f.s fa4, v10 +; RV64-i64-NEXT: vfmv.f.s fa5, v12 ; RV64-i64-NEXT: fcvt.l.s a1, fa5 -; RV64-i64-NEXT: sd a1, 240(sp) -; RV64-i64-NEXT: flw fa5, 116(sp) -; RV64-i64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-i64-NEXT: vslidedown.vi v14, v8, 7 -; RV64-i64-NEXT: fcvt.l.s a1, fa4 -; RV64-i64-NEXT: vfmv.f.s fa4, v11 +; RV64-i64-NEXT: vfmv.f.s fa5, v13 ; RV64-i64-NEXT: fcvt.l.s a2, fa5 -; RV64-i64-NEXT: sd a2, 232(sp) -; RV64-i64-NEXT: flw fa5, 112(sp) -; RV64-i64-NEXT: fcvt.l.s a2, fa4 -; RV64-i64-NEXT: vfmv.f.s fa4, v12 -; RV64-i64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-i64-NEXT: vfmv.f.s fa5, v14 ; RV64-i64-NEXT: fcvt.l.s a3, fa5 -; RV64-i64-NEXT: sd a3, 224(sp) -; RV64-i64-NEXT: flw fa5, 108(sp) -; RV64-i64-NEXT: fcvt.l.s a3, fa4 -; RV64-i64-NEXT: vfmv.f.s fa4, v14 -; RV64-i64-NEXT: vslidedown.vi v12, v8, 5 +; RV64-i64-NEXT: vfmv.f.s fa5, v16 ; RV64-i64-NEXT: fcvt.l.s a4, fa5 -; RV64-i64-NEXT: sd a4, 216(sp) -; RV64-i64-NEXT: flw fa5, 104(sp) -; RV64-i64-NEXT: fcvt.l.s a4, fa4 -; RV64-i64-NEXT: vfmv.f.s fa4, v10 -; RV64-i64-NEXT: fcvt.l.s a5, fa4 +; RV64-i64-NEXT: vfmv.f.s fa5, v18 +; RV64-i64-NEXT: fcvt.l.s a5, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v20 ; RV64-i64-NEXT: fcvt.l.s a6, fa5 -; RV64-i64-NEXT: sd a6, 208(sp) +; RV64-i64-NEXT: flw fa5, 124(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: sd a7, 248(sp) +; RV64-i64-NEXT: flw fa5, 120(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: sd a7, 240(sp) +; RV64-i64-NEXT: flw fa5, 116(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: sd a7, 232(sp) +; RV64-i64-NEXT: flw fa5, 112(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: sd a7, 224(sp) +; RV64-i64-NEXT: flw fa5, 108(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: sd a7, 216(sp) +; RV64-i64-NEXT: flw fa5, 104(sp) +; RV64-i64-NEXT: fcvt.l.s a7, fa5 +; RV64-i64-NEXT: sd a7, 208(sp) ; RV64-i64-NEXT: flw fa5, 100(sp) -; RV64-i64-NEXT: vfmv.f.s fa4, 
v12 -; RV64-i64-NEXT: fcvt.l.s a6, fa4 -; RV64-i64-NEXT: vslidedown.vi v8, v8, 4 ; RV64-i64-NEXT: fcvt.l.s a7, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 ; RV64-i64-NEXT: sd a7, 200(sp) @@ -877,26 +880,27 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i64-NEXT: .cfi_def_cfa s0, 0 ; RV64-i64-NEXT: andi sp, sp, -64 ; RV64-i64-NEXT: mv a0, sp +; RV64-i64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-i64-NEXT: vfmv.f.s fa5, v8 +; RV64-i64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-i64-NEXT: vsetivli zero, 1, e64, m2, ta, ma +; RV64-i64-NEXT: vslidedown.vi v14, v8, 3 ; RV64-i64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-i64-NEXT: vse64.v v8, (a0) -; RV64-i64-NEXT: fld fa5, 56(sp) -; RV64-i64-NEXT: vfmv.f.s fa4, v8 -; RV64-i64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-i64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-i64-NEXT: fcvt.l.d a0, fa4 -; RV64-i64-NEXT: fcvt.l.d a1, fa5 -; RV64-i64-NEXT: sd a1, 120(sp) -; RV64-i64-NEXT: fld fa5, 48(sp) -; RV64-i64-NEXT: vfmv.f.s fa4, v10 ; RV64-i64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-i64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-i64-NEXT: fcvt.l.d a1, fa4 +; RV64-i64-NEXT: vslidedown.vi v8, v8, 2 +; RV64-i64-NEXT: fcvt.l.d a0, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v12 +; RV64-i64-NEXT: fcvt.l.d a1, fa5 +; RV64-i64-NEXT: vfmv.f.s fa5, v14 ; RV64-i64-NEXT: fcvt.l.d a2, fa5 -; RV64-i64-NEXT: sd a2, 112(sp) +; RV64-i64-NEXT: fld fa5, 56(sp) +; RV64-i64-NEXT: fcvt.l.d a3, fa5 +; RV64-i64-NEXT: sd a3, 120(sp) +; RV64-i64-NEXT: fld fa5, 48(sp) +; RV64-i64-NEXT: fcvt.l.d a3, fa5 +; RV64-i64-NEXT: sd a3, 112(sp) ; RV64-i64-NEXT: fld fa5, 40(sp) -; RV64-i64-NEXT: vfmv.f.s fa4, v10 -; RV64-i64-NEXT: fcvt.l.d a2, fa4 -; RV64-i64-NEXT: vslidedown.vi v8, v8, 2 ; RV64-i64-NEXT: fcvt.l.d a3, fa5 ; RV64-i64-NEXT: vfmv.f.s fa5, v8 ; RV64-i64-NEXT: sd a3, 104(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll index c29ccd45528b8..a258818539258 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -141,8 +141,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) { ; CHECK-LABEL: buildvec_mask_nonconst_v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v0, 3 ; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmv.v.i v0, 3 ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 @@ -151,8 +151,8 @@ define <4 x i1> @buildvec_mask_nonconst_v4i1(i1 %x, i1 %y) { ; ZVE32F-LABEL: buildvec_mask_nonconst_v4i1: ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; ZVE32F-NEXT: vmv.v.i v0, 3 ; ZVE32F-NEXT: vmv.v.x v8, a1 +; ZVE32F-NEXT: vmv.v.i v0, 3 ; ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 @@ -245,10 +245,10 @@ define <8 x i1> @buildvec_mask_v8i1() { define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) { ; CHECK-LABEL: buildvec_mask_nonconst_v8i1: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 19 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a2 ; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: li a1, 19 +; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 @@ -256,10 +256,10 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1(i1 %x, i1 %y) { ; ; ZVE32F-LABEL: buildvec_mask_nonconst_v8i1: ; ZVE32F: # %bb.0: -; ZVE32F-NEXT: li 
a2, 19 ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; ZVE32F-NEXT: vmv.s.x v0, a2 ; ZVE32F-NEXT: vmv.v.x v8, a1 +; ZVE32F-NEXT: li a1, 19 +; ZVE32F-NEXT: vmv.s.x v0, a1 ; ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 @@ -282,12 +282,12 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vslide1down.vx v9, v8, a0 ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v9, v9, a0 ; CHECK-NEXT: vslide1down.vx v8, v8, zero ; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 @@ -299,12 +299,12 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) { ; ZVE32F-NEXT: vmv.v.x v8, a0 ; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 ; ZVE32F-NEXT: li a0, 1 -; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; ZVE32F-NEXT: vslide1down.vx v8, v8, zero ; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 @@ -327,12 +327,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vslide1down.vx v9, v8, a0 ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v9, v9, a0 ; CHECK-NEXT: vslide1down.vx v8, v8, zero ; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 @@ -344,12 +344,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 % ; ZVE32F-NEXT: vmv.v.x v8, a0 ; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 ; ZVE32F-NEXT: li a0, 1 -; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a0 ; ZVE32F-NEXT: vslide1down.vx v8, v8, zero ; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 @@ -370,13 +370,13 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslide1down.vx v9, v8, a0 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v9, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: vmv.v.i v0, 15 ; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 @@ -386,13 +386,13 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize { ; ZVE32F: # %bb.0: ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; ZVE32F-NEXT: vmv.v.x v8, a0 -; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslide1down.vx v9, v8, a0 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, 
v8, a1 ; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; ZVE32F-NEXT: vmv.v.i v0, 15 ; ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; ZVE32F-NEXT: vand.vi v8, v8, 1 ; ZVE32F-NEXT: vmsne.vi v0, v8, 0 @@ -528,12 +528,12 @@ define <128 x i1> @buildvec_mask_v128i1() { ; RV64: # %bb.0: ; RV64-NEXT: lui a0, %hi(.LCPI20_0) ; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0) -; RV64-NEXT: lui a1, %hi(.LCPI20_1) -; RV64-NEXT: ld a1, %lo(.LCPI20_1)(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v0, a0 +; RV64-NEXT: lui a0, %hi(.LCPI20_1) +; RV64-NEXT: ld a0, %lo(.LCPI20_1)(a0) ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: ret ; ; ZVE32F-LABEL: buildvec_mask_v128i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll index 979785dd2c024..84486a96873d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll @@ -24,11 +24,11 @@ define void @splat_zeros_v2i1(ptr %x) { define void @splat_v1i1(ptr %x, i1 %y) { ; CHECK-LABEL: splat_v1i1: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.s.x v8, a1 -; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: andi a1, a1, 1 +; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 @@ -48,11 +48,11 @@ define void @splat_v1i1_icmp(ptr %x, i32 signext %y, i32 signext %z) { ; CHECK-LABEL: splat_v1i1_icmp: ; CHECK: # %bb.0: ; CHECK-NEXT: xor a1, a1, a2 -; CHECK-NEXT: seqz a1, a1 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.s.x v8, a1 -; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 @@ -84,9 +84,9 @@ define void @splat_v4i1(ptr %x, i1 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: andi a1, a1, 1 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v9, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 232a364e87f0e..29e7179b65acb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -406,7 +406,6 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t -; RV32ZVE32F-NEXT: sw zero, 12(a0) ; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vmv.x.s a1, v9 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 @@ -415,6 +414,7 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x ; RV32ZVE32F-NEXT: sw a1, 0(a0) ; RV32ZVE32F-NEXT: sw zero, 4(a0) ; RV32ZVE32F-NEXT: sw a2, 8(a0) +; RV32ZVE32F-NEXT: sw zero, 12(a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: 
mgather_v2i8_zextload_v2i64: @@ -732,9 +732,9 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 ; RV64ZVE32F-NEXT: .LBB12_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -755,9 +755,9 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB12_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB12_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -1433,9 +1433,9 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: .LBB23_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1458,9 +1458,9 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB23_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB23_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -1582,9 +1582,9 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB24_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB24_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1607,9 +1607,9 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB24_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB24_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -1732,9 +1732,9 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB25_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez 
a2, .LBB25_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1758,9 +1758,9 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB25_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB25_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -1885,9 +1885,9 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: .LBB26_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB26_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1909,9 +1909,9 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB26_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB26_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -2149,15 +2149,15 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 ; ; RV32ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi a1, a0, 8 ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; RV32ZVE32F-NEXT: vluxei32.v v9, (zero), v8, v0.t ; RV32ZVE32F-NEXT: sw zero, 4(a0) ; RV32ZVE32F-NEXT: sw zero, 12(a0) +; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vse32.v v9, (a0) -; RV32ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV32ZVE32F-NEXT: vse32.v v8, (a1) +; RV32ZVE32F-NEXT: addi a0, a0, 8 +; RV32ZVE32F-NEXT: vse32.v v8, (a0) ; RV32ZVE32F-NEXT: ret ; ; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: @@ -2480,9 +2480,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2505,9 +2505,9 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB35_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB35_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -2628,9 +2628,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB36_4: 
# %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB36_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2653,9 +2653,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB36_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB36_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -2780,9 +2780,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB37_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB37_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2806,9 +2806,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB37_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB37_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -2937,9 +2937,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: .LBB38_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB38_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2962,9 +2962,9 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB38_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB38_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -3087,9 +3087,9 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: .LBB39_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB39_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -3112,9 +3112,9 @@ define <8 x i32> 
@mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB39_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB39_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -3240,9 +3240,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: .LBB40_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB40_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -3266,9 +3266,9 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB40_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB40_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -3391,9 +3391,9 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: .LBB41_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB41_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -3415,9 +3415,9 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 ; RV64ZVE32F-NEXT: .LBB41_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB41_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -4109,9 +4109,9 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a3, .LBB48_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB48_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) @@ -4272,9 +4272,9 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: .LBB48_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB48_10 ; RV64ZVE32F-NEXT: # %bb.6: # 
%cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -4320,8 +4320,8 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB48_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB48_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -4386,9 +4386,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a3, .LBB49_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB49_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) @@ -4549,9 +4549,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB49_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB49_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -4597,8 +4597,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB49_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB49_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -4665,9 +4665,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a3, .LBB50_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB50_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) @@ -4830,9 +4830,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB50_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB50_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -4882,8 +4882,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB50_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB50_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -4950,10 +4950,10 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; 
RV32ZVE32F-NEXT: li a1, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 -; RV32ZVE32F-NEXT: beqz a3, .LBB51_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB51_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v10 @@ -5116,9 +5116,9 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: .LBB51_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB51_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -5164,8 +5164,8 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB51_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB51_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -5229,10 +5229,10 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: li a1, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 -; RV32ZVE32F-NEXT: beqz a3, .LBB52_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB52_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v10 @@ -5395,9 +5395,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: .LBB52_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB52_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -5443,8 +5443,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB52_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB52_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -5510,10 +5510,10 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: li a1, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8 -; RV32ZVE32F-NEXT: beqz a3, .LBB53_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB53_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, 
m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v10 @@ -5678,9 +5678,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: .LBB53_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB53_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -5730,8 +5730,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB53_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB53_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -5797,10 +5797,10 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a3, .LBB54_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB54_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) @@ -5962,9 +5962,9 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: .LBB54_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB54_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -6010,8 +6010,8 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB54_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB54_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -6074,10 +6074,10 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a3, .LBB55_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB55_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) @@ -6239,9 +6239,9 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: .LBB55_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, 
.LBB55_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -6287,8 +6287,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB55_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB55_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -6352,10 +6352,10 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a3, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a3, .LBB56_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB56_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) @@ -6519,9 +6519,9 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: .LBB56_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: beqz a6, .LBB56_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 ; RV64ZVE32F-NEXT: vmv.x.s a6, v8 @@ -6571,8 +6571,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB56_14: # %else14 -; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: beqz t2, .LBB56_17 ; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -6654,9 +6654,9 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: andi a2, t0, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: beqz a2, .LBB57_7 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: beqz a1, .LBB57_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a2) @@ -7073,14 +7073,14 @@ define <4 x bfloat> @mgather_truemask_v4bf16(<4 x ptr> %ptrs, <4 x bfloat> %pass ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: ld a3, 16(a0) ; RV64ZVE32F-NEXT: ld a0, 24(a0) -; RV64ZVE32F-NEXT: lh a1, 0(a1) ; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: lh a1, 0(a1) ; RV64ZVE32F-NEXT: lh a3, 0(a3) -; RV64ZVE32F-NEXT: lh a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: lh a0, 0(a0) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %v = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x bfloat> %passthru) @@ -7271,9 +7271,9 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: .LBB64_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB64_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7296,9 +7296,9 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB64_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB64_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -7420,9 +7420,9 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB65_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7445,9 +7445,9 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB65_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB65_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -7570,9 +7570,9 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB66_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7596,9 +7596,9 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB66_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB66_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -7723,9 +7723,9 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: .LBB67_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB67_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -7747,9 +7747,9 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vsetivli 
zero, 6, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-NEXT: .LBB67_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB67_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -8085,14 +8085,14 @@ define <4 x half> @mgather_truemask_v4f16(<4 x ptr> %ptrs, <4 x half> %passthru) ; RV64ZVE32F-ZVFHMIN-NEXT: ld a2, 8(a0) ; RV64ZVE32F-ZVFHMIN-NEXT: ld a3, 16(a0) ; RV64ZVE32F-ZVFHMIN-NEXT: ld a0, 24(a0) -; RV64ZVE32F-ZVFHMIN-NEXT: lh a1, 0(a1) ; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: lh a1, 0(a1) ; RV64ZVE32F-ZVFHMIN-NEXT: lh a3, 0(a3) -; RV64ZVE32F-ZVFHMIN-NEXT: lh a0, 0(a0) ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-ZVFHMIN-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a0, 0(a0) ; RV64ZVE32F-ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-ZVFHMIN-NEXT: ret %v = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x half> %passthru) @@ -8376,9 +8376,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFH-NEXT: .LBB74_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB74_14 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -8401,9 +8401,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: .LBB74_9: # %else14 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_11 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 @@ -8500,9 +8500,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB74_14 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -8525,9 +8525,9 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_9: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_11 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # 
%cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 @@ -8649,9 +8649,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: .LBB75_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB75_14 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -8674,9 +8674,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: .LBB75_9: # %else14 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_11 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 @@ -8773,9 +8773,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB75_14 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -8798,9 +8798,9 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_9: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_11 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 @@ -8923,9 +8923,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: .LBB76_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB76_14 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -8949,9 +8949,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: .LBB76_9: # %else14 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_11 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 @@ -9055,9 +9055,9 @@ define <8 x 
half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB76_14 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -9081,9 +9081,9 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_9: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_11 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 @@ -9208,9 +9208,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFH-NEXT: .LBB77_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB77_14 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -9232,9 +9232,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: .LBB77_9: # %else14 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_11 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 @@ -9324,9 +9324,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB77_14 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -9348,9 +9348,9 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_9: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_11 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 @@ -9791,9 +9791,9 @@ define <8 x float> 
@mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: .LBB84_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9816,9 +9816,9 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB84_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB84_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -9939,9 +9939,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: .LBB85_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB85_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9964,9 +9964,9 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB85_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB85_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -10091,9 +10091,9 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: .LBB86_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB86_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10117,9 +10117,9 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB86_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB86_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -10248,9 +10248,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: .LBB87_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB87_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; 
RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10273,9 +10273,9 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB87_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB87_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -10398,9 +10398,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: .LBB88_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB88_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10423,9 +10423,9 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB88_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB88_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -10551,9 +10551,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: .LBB89_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB89_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10577,9 +10577,9 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 ; RV64ZVE32F-NEXT: .LBB89_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB89_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -10702,9 +10702,9 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; RV64ZVE32F-NEXT: .LBB90_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB90_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10726,9 +10726,9 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 ; RV64ZVE32F-NEXT: .LBB90_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 +; RV64ZVE32F-NEXT: andi a2, 
a1, 64 ; RV64ZVE32F-NEXT: beqz a2, .LBB90_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -11308,9 +11308,9 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB97_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB97_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB97_11 @@ -11420,9 +11420,9 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB97_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -11440,8 +11440,8 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB97_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB97_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -11523,9 +11523,9 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB98_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB98_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB98_11 @@ -11635,9 +11635,9 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB98_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -11655,8 +11655,8 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB98_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB98_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -11740,9 +11740,9 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB99_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB99_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: 
andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB99_11 @@ -11854,9 +11854,9 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB99_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB99_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -11875,8 +11875,8 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB99_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB99_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -11963,10 +11963,10 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV32ZVE32F-NEXT: li a2, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a3, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccus.vx v10, a2, v8 -; RV32ZVE32F-NEXT: bnez a3, .LBB100_10 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB100_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB100_11 @@ -12078,9 +12078,9 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB100_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB100_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -12098,8 +12098,8 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB100_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB100_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -12180,10 +12180,10 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F-NEXT: li a2, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a3, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccus.vx v10, a2, v8 -; RV32ZVE32F-NEXT: bnez a3, .LBB101_10 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB101_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB101_11 @@ -12295,9 +12295,9 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: .LBB101_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: 
bnez a3, .LBB101_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -12315,8 +12315,8 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB101_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB101_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -12399,10 +12399,10 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F-NEXT: li a2, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a3, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccu.vx v10, a2, v8 -; RV32ZVE32F-NEXT: bnez a3, .LBB102_10 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: bnez a2, .LBB102_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB102_11 @@ -12516,9 +12516,9 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: .LBB102_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB102_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -12537,8 +12537,8 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB102_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB102_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -12624,10 +12624,10 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB103_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB103_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB103_11 @@ -12738,9 +12738,9 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB103_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB103_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -12758,8 +12758,8 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB103_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB103_11 ; 
RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -12839,10 +12839,10 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB104_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB104_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB104_11 @@ -12953,9 +12953,9 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: .LBB104_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB104_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -12973,8 +12973,8 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB104_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB104_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -13055,10 +13055,10 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v0 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB105_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB105_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB105_11 @@ -13171,9 +13171,9 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: .LBB105_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB105_14 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 @@ -13192,8 +13192,8 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) ; RV64ZVE32F-NEXT: .LBB105_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 64 ; RV64ZVE32F-NEXT: beqz a3, .LBB105_11 ; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 @@ -13295,9 +13295,9 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: andi a3, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez a3, .LBB106_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; 
RV32ZVE32F-NEXT: bnez a1, .LBB106_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a2, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB106_11 @@ -13528,9 +13528,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: .LBB107_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB107_25 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -13546,9 +13546,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 ; RV64ZVE32F-NEXT: .LBB107_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_10 ; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -13560,9 +13560,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 ; RV64ZVE32F-NEXT: .LBB107_10: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB107_27 ; RV64ZVE32F-NEXT: # %bb.11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a1, 128 @@ -13585,9 +13585,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: .LBB107_15: # %else26 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: bnez a2, .LBB107_30 ; RV64ZVE32F-NEXT: # %bb.16: # %else29 ; RV64ZVE32F-NEXT: slli a2, a1, 52 @@ -13608,9 +13608,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 13 ; RV64ZVE32F-NEXT: .LBB107_20: # %else38 -; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: bgez a2, .LBB107_22 ; RV64ZVE32F-NEXT: # %bb.21: # %cond.load40 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -13741,15 +13741,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64V-NEXT: vsext.vf8 v16, v8 ; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64V-NEXT: vslidedown.vi v12, v10, 16 -; RV64V-NEXT: vslidedown.vi v14, v8, 16 -; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64V-NEXT: vslidedown.vi v8, v0, 2 +; RV64V-NEXT: vslidedown.vi v8, v8, 16 ; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-NEXT: vluxei64.v v10, (a0), v16, v0.t ; RV64V-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64V-NEXT: vsext.vf8 v16, v14 -; RV64V-NEXT: vmv1r.v v0, v8 -; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64V-NEXT: vsext.vf8 v16, v8 +; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64V-NEXT: vslidedown.vi v0, v0, 2 +; 
RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t ; RV64V-NEXT: li a0, 32 ; RV64V-NEXT: vsetvli zero, a0, e8, m2, ta, ma @@ -13784,9 +13783,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: .LBB108_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB108_49 ; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -13802,9 +13801,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 ; RV64ZVE32F-NEXT: .LBB108_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_10 ; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -13816,9 +13815,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 ; RV64ZVE32F-NEXT: .LBB108_10: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB108_51 ; RV64ZVE32F-NEXT: # %bb.11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a1, 128 @@ -13841,9 +13840,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: .LBB108_15: # %else26 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_17 ; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 @@ -13865,9 +13864,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 11 ; RV64ZVE32F-NEXT: .LBB108_19: # %else32 -; RV64ZVE32F-NEXT: slli a2, a1, 51 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16 +; RV64ZVE32F-NEXT: slli a2, a1, 51 ; RV64ZVE32F-NEXT: bgez a2, .LBB108_21 ; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 @@ -13889,9 +13888,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 13 ; RV64ZVE32F-NEXT: .LBB108_23: # %else38 -; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: bltz a2, .LBB108_54 ; RV64ZVE32F-NEXT: # %bb.24: # %else41 ; RV64ZVE32F-NEXT: slli a2, a1, 48 @@ -13914,9 +13913,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: .LBB108_28: # %else50 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi 
v9, v8, 4 -; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: bltz a2, .LBB108_57 ; RV64ZVE32F-NEXT: # %bb.29: # %else53 ; RV64ZVE32F-NEXT: slli a2, a1, 44 @@ -13932,9 +13931,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20 ; RV64ZVE32F-NEXT: .LBB108_32: # %else59 -; RV64ZVE32F-NEXT: slli a2, a1, 42 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: slli a2, a1, 42 ; RV64ZVE32F-NEXT: bgez a2, .LBB108_34 ; RV64ZVE32F-NEXT: # %bb.33: # %cond.load61 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -13946,9 +13945,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 22, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 21 ; RV64ZVE32F-NEXT: .LBB108_34: # %else62 -; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: bltz a2, .LBB108_59 ; RV64ZVE32F-NEXT: # %bb.35: # %else65 ; RV64ZVE32F-NEXT: slli a2, a1, 40 @@ -13971,9 +13970,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: .LBB108_39: # %else74 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: slli a2, a1, 37 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 37 ; RV64ZVE32F-NEXT: bltz a2, .LBB108_62 ; RV64ZVE32F-NEXT: # %bb.40: # %else77 ; RV64ZVE32F-NEXT: slli a2, a1, 36 @@ -13994,9 +13993,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 30, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 29 ; RV64ZVE32F-NEXT: .LBB108_44: # %else86 -; RV64ZVE32F-NEXT: slli a2, a1, 33 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 33 ; RV64ZVE32F-NEXT: bgez a2, .LBB108_46 ; RV64ZVE32F-NEXT: # %bb.45: # %cond.load88 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -14279,8 +14278,8 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) { ; RV32: # %bb.0: ; RV32-NEXT: li a1, -512 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vmv.v.x v8, a1 +; RV32-NEXT: vmv.v.i v0, 5 ; RV32-NEXT: vmerge.vim v8, v8, 0, v0 ; RV32-NEXT: vluxei32.v v8, (a0), v8 ; RV32-NEXT: ret @@ -14288,10 +14287,11 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) { ; RV64V-LABEL: mgather_narrow_edge_case: ; RV64V: # %bb.0: ; RV64V-NEXT: li a1, -512 +; RV64V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64V-NEXT: vmv.v.x v8, a1 ; RV64V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64V-NEXT: vmv.v.i v0, 5 ; RV64V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64V-NEXT: vmv.v.x v8, a1 ; RV64V-NEXT: vmerge.vim v10, v8, 0, v0 ; RV64V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64V-NEXT: vluxei64.v v8, (a0), v10 @@ -14302,8 +14302,8 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) { ; RV64ZVE32F-NEXT: lw a1, -512(a0) ; RV64ZVE32F-NEXT: lw a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.v.i v0, 5 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; 
RV64ZVE32F-NEXT: vmv.v.i v0, 5 ; RV64ZVE32F-NEXT: vmerge.vxm v8, v8, a0, v0 ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i32, ptr %base, <4 x i8> @@ -14337,36 +14337,36 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV32-NEXT: lbu a0, 1(a0) ; RV32-NEXT: vmv.x.s a7, v10 ; RV32-NEXT: vmv.x.s t0, v8 -; RV32-NEXT: lbu t1, 0(a1) -; RV32-NEXT: lbu a1, 1(a1) -; RV32-NEXT: lbu t2, 0(a2) -; RV32-NEXT: lbu a2, 1(a2) ; RV32-NEXT: slli a0, a0, 8 ; RV32-NEXT: or a0, a0, a6 -; RV32-NEXT: lbu a6, 0(a3) -; RV32-NEXT: lbu a3, 1(a3) +; RV32-NEXT: lbu a6, 0(a1) +; RV32-NEXT: lbu a1, 1(a1) ; RV32-NEXT: slli a1, a1, 8 -; RV32-NEXT: or a1, a1, t1 -; RV32-NEXT: lbu t1, 0(a4) -; RV32-NEXT: lbu a4, 1(a4) +; RV32-NEXT: or a1, a1, a6 +; RV32-NEXT: lbu a6, 0(a2) +; RV32-NEXT: lbu a2, 1(a2) ; RV32-NEXT: slli a2, a2, 8 -; RV32-NEXT: or a2, a2, t2 -; RV32-NEXT: lbu t2, 0(a5) -; RV32-NEXT: lbu a5, 1(a5) +; RV32-NEXT: or a2, a2, a6 +; RV32-NEXT: lbu a6, 0(a3) +; RV32-NEXT: lbu a3, 1(a3) ; RV32-NEXT: slli a3, a3, 8 ; RV32-NEXT: or a3, a3, a6 -; RV32-NEXT: lbu a6, 0(a7) -; RV32-NEXT: lbu a7, 1(a7) +; RV32-NEXT: lbu a6, 0(a4) +; RV32-NEXT: lbu a4, 1(a4) ; RV32-NEXT: slli a4, a4, 8 -; RV32-NEXT: or a4, a4, t1 -; RV32-NEXT: lbu t1, 0(t0) -; RV32-NEXT: lbu t0, 1(t0) +; RV32-NEXT: or a4, a4, a6 +; RV32-NEXT: lbu a6, 0(a5) +; RV32-NEXT: lbu a5, 1(a5) ; RV32-NEXT: slli a5, a5, 8 -; RV32-NEXT: or a5, a5, t2 +; RV32-NEXT: or a5, a5, a6 +; RV32-NEXT: lbu a6, 0(a7) +; RV32-NEXT: lbu a7, 1(a7) ; RV32-NEXT: slli a7, a7, 8 ; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: lbu a7, 0(t0) +; RV32-NEXT: lbu t0, 1(t0) ; RV32-NEXT: slli t0, t0, 8 -; RV32-NEXT: or a7, t0, t1 +; RV32-NEXT: or a7, t0, a7 ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV32-NEXT: vmv.v.x v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a1 @@ -14375,8 +14375,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV32-NEXT: vslide1down.vx v9, v9, a5 ; RV32-NEXT: vslide1down.vx v10, v8, a3 ; RV32-NEXT: vslide1down.vx v8, v9, a6 -; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vslide1down.vx v8, v8, a7 +; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV32-NEXT: ret ; @@ -14450,8 +14450,8 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64V-NEXT: vmv.v.x v8, a3 ; RV64V-NEXT: vslide1down.vx v8, v8, a0 ; RV64V-NEXT: vslide1down.vx v8, v8, a1 -; RV64V-NEXT: vmv.v.i v0, 15 ; RV64V-NEXT: vslide1down.vx v8, v8, a2 +; RV64V-NEXT: vmv.v.i v0, 15 ; RV64V-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64V-NEXT: addi sp, s0, -128 ; RV64V-NEXT: .cfi_def_cfa sp, 128 @@ -14475,38 +14475,38 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: lbu t0, 13(a0) ; RV64ZVE32F-NEXT: slli a2, a2, 8 ; RV64ZVE32F-NEXT: slli a4, a4, 8 +; RV64ZVE32F-NEXT: slli a6, a6, 8 ; RV64ZVE32F-NEXT: or a1, a2, a1 ; RV64ZVE32F-NEXT: or a3, a4, a3 -; RV64ZVE32F-NEXT: lbu a2, 16(a0) -; RV64ZVE32F-NEXT: lbu a4, 17(a0) -; RV64ZVE32F-NEXT: lbu t1, 20(a0) -; RV64ZVE32F-NEXT: lbu t2, 21(a0) -; RV64ZVE32F-NEXT: slli a6, a6, 8 -; RV64ZVE32F-NEXT: or a5, a6, a5 +; RV64ZVE32F-NEXT: or a2, a6, a5 +; RV64ZVE32F-NEXT: lbu a4, 16(a0) +; RV64ZVE32F-NEXT: lbu a5, 17(a0) +; RV64ZVE32F-NEXT: lbu a6, 20(a0) +; RV64ZVE32F-NEXT: lbu t1, 21(a0) ; RV64ZVE32F-NEXT: slli t0, t0, 8 -; RV64ZVE32F-NEXT: slli a4, a4, 8 -; RV64ZVE32F-NEXT: slli t2, t2, 8 -; RV64ZVE32F-NEXT: or a6, t0, a7 -; RV64ZVE32F-NEXT: or a2, a4, a2 -; RV64ZVE32F-NEXT: lbu a4, 24(a0) -; RV64ZVE32F-NEXT: lbu a7, 25(a0) -; RV64ZVE32F-NEXT: or t0, t2, t1 +; RV64ZVE32F-NEXT: slli a5, a5, 8 +; 
RV64ZVE32F-NEXT: slli t1, t1, 8 +; RV64ZVE32F-NEXT: or a7, t0, a7 +; RV64ZVE32F-NEXT: or a4, a5, a4 +; RV64ZVE32F-NEXT: or a5, t1, a6 +; RV64ZVE32F-NEXT: lbu a6, 24(a0) +; RV64ZVE32F-NEXT: lbu t0, 25(a0) ; RV64ZVE32F-NEXT: lbu t1, 28(a0) ; RV64ZVE32F-NEXT: lbu a0, 29(a0) -; RV64ZVE32F-NEXT: slli a7, a7, 8 -; RV64ZVE32F-NEXT: or a4, a7, a4 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 +; RV64ZVE32F-NEXT: slli t0, t0, 8 +; RV64ZVE32F-NEXT: or a6, t0, a6 ; RV64ZVE32F-NEXT: slli a0, a0, 8 ; RV64ZVE32F-NEXT: or a0, a0, t1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a2 +; RV64ZVE32F-NEXT: vmv.v.x v9, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, t0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14541,7 +14541,6 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 24(a0) ; RV64ZVE32F-NEXT: lh a0, 26(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 @@ -14550,6 +14549,7 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14586,7 +14586,6 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 28(a0) ; RV64ZVE32F-NEXT: lh a0, 30(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 @@ -14595,6 +14594,7 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> @@ -14631,7 +14631,6 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a3 ; RV64ZVE32F-NEXT: vmv.v.x v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 @@ -14640,6 +14639,7 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, 
<8 x i64> @@ -14676,7 +14676,6 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 12(a0) ; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a3 ; RV64ZVE32F-NEXT: vmv.v.x v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 @@ -14685,6 +14684,7 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> @@ -14720,7 +14720,6 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vmv.v.x v9, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 @@ -14729,6 +14728,7 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14767,7 +14767,6 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a5 ; RV64ZVE32F-NEXT: vmv.v.x v9, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 @@ -14776,6 +14775,7 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14806,23 +14806,23 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 6(a0) -; RV64ZVE32F-NEXT: lh a4, 8(a0) -; RV64ZVE32F-NEXT: lh a5, 10(a0) -; RV64ZVE32F-NEXT: lh a6, 18(a0) -; RV64ZVE32F-NEXT: lh a0, 20(a0) +; RV64ZVE32F-NEXT: lh a1, 10(a0) +; RV64ZVE32F-NEXT: lh a2, 18(a0) +; RV64ZVE32F-NEXT: lh a3, 20(a0) +; RV64ZVE32F-NEXT: lh a4, 2(a0) +; RV64ZVE32F-NEXT: lh a5, 4(a0) +; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: lh a0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 -; RV64ZVE32F-NEXT: vmv.v.x v8, a1 -; RV64ZVE32F-NEXT: vmv.v.x v9, a4 +; RV64ZVE32F-NEXT: vmv.v.x v8, a4 +; RV64ZVE32F-NEXT: vmv.v.x v9, a0 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a0 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vmv.v.i 
v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14865,7 +14865,6 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 @@ -14874,6 +14873,7 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14913,7 +14913,6 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 20(a0) ; RV64ZVE32F-NEXT: lh a0, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 @@ -14922,6 +14921,7 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32> @@ -14970,7 +14970,6 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 4(a0) ; RV64ZVE32F-NEXT: lh a0, 6(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 @@ -14979,6 +14978,7 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> @@ -15018,7 +15018,6 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV64ZVE32F-NEXT: lh a7, 12(a0) ; RV64ZVE32F-NEXT: lh a0, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vmv.v.x v8, a1 ; RV64ZVE32F-NEXT: vmv.v.x v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 @@ -15027,6 +15026,7 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a7 ; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a0 +; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll index f72b08a405246..f27c8e5d664e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll @@ -242,9 +242,9 @@ define <32 x double> @masked_load_v32f64(ptr %a, <32 x i1> %mask) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0), 
v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a0), v0.t ; CHECK-NEXT: ret @@ -278,12 +278,12 @@ define <64 x float> @masked_load_v64f32(ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_load_v64f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <64 x float> @llvm.masked.load.v64f32(ptr %a, i32 8, <64 x i1> %mask, <64 x float> undef) @@ -294,12 +294,12 @@ define <128 x bfloat> @masked_load_v128bf16(ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_load_v128bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <128 x bfloat> @llvm.masked.load.v128bf16(ptr %a, i32 8, <128 x i1> %mask, <128 x bfloat> undef) @@ -310,12 +310,12 @@ define <128 x half> @masked_load_v128f16(ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_load_v128f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <128 x half> @llvm.masked.load.v128f16(ptr %a, i32 8, <128 x i1> %mask, <128 x half> undef) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll index 69903d77084bf..6e613917f8cd9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll @@ -240,9 +240,9 @@ define <32 x i64> @masked_load_v32i64(ptr %a, <32 x i1> %mask) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a0), v0.t ; CHECK-NEXT: ret @@ -276,12 +276,12 @@ define <64 x i32> @masked_load_v64i32(ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_load_v64i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetivli 
zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <64 x i32> @llvm.masked.load.v64i32(ptr %a, i32 8, <64 x i1> %mask, <64 x i32> undef) @@ -303,12 +303,12 @@ define <128 x i16> @masked_load_v128i16(ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_load_v128i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0), v0.t ; CHECK-NEXT: ret %load = call <128 x i16> @llvm.masked.load.v128i16(ptr %a, i32 8, <128 x i1> %mask, <128 x i16> undef) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 7354f9afa9a71..7358fd4cfa0f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -123,9 +123,9 @@ define void @mscatter_v2i16_truncstore_v2i8(<2 x i16> %val, <2 x ptr> %ptrs, <2 ; RV64ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 ; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 ; RV64ZVE32F-NEXT: bnez a3, .LBB2_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 @@ -181,8 +181,8 @@ define void @mscatter_v2i32_truncstore_v2i8(<2 x i32> %val, <2 x ptr> %ptrs, <2 ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB3_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 @@ -229,11 +229,11 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x ptr> %ptrs, <2 ; ; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i8: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a1, 8(a0) -; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: lw a0, 8(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32ZVE32F-NEXT: vmv.s.x v9, a1 -; RV32ZVE32F-NEXT: vmv.s.x v10, a0 +; RV32ZVE32F-NEXT: vmv.s.x v9, a0 +; RV32ZVE32F-NEXT: vmv.s.x v10, a1 ; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV32ZVE32F-NEXT: vsoxei32.v v10, (zero), v8, v0.t ; RV32ZVE32F-NEXT: ret @@ -244,8 +244,8 @@ define void @mscatter_v2i64_truncstore_v2i8(<2 x i64> %val, <2 x ptr> %ptrs, <2 ; RV64ZVE32F-NEXT: vmv.s.x v9, a1 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 -; RV64ZVE32F-NEXT: andi a1, a0, 1 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a1, a0, 1 ; RV64ZVE32F-NEXT: bnez a1, .LBB4_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a0, 2 @@ -513,9 +513,9 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: .LBB9_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, 
ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB9_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -535,9 +535,9 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: .LBB9_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB9_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -698,11 +698,11 @@ define void @mscatter_v2i32_truncstore_v2i16(<2 x i32> %val, <2 x ptr> %ptrs, <2 ; ; RV64ZVE32F-LABEL: mscatter_v2i32_truncstore_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: bnez a3, .LBB12_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 @@ -745,11 +745,11 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x ptr> %ptrs, <2 ; ; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i16: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a1, 8(a0) -; RV32ZVE32F-NEXT: lw a0, 0(a0) +; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: lw a0, 8(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.s.x v9, a1 -; RV32ZVE32F-NEXT: vmv.s.x v10, a0 +; RV32ZVE32F-NEXT: vmv.s.x v9, a0 +; RV32ZVE32F-NEXT: vmv.s.x v10, a1 ; RV32ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV32ZVE32F-NEXT: vsoxei32.v v10, (zero), v8, v0.t ; RV32ZVE32F-NEXT: ret @@ -761,9 +761,9 @@ define void @mscatter_v2i64_truncstore_v2i16(<2 x i64> %val, <2 x ptr> %ptrs, <2 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 -; RV64ZVE32F-NEXT: andi a1, a0, 1 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: andi a1, a0, 1 ; RV64ZVE32F-NEXT: bnez a1, .LBB13_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a0, 2 @@ -1035,9 +1035,9 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB18_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1058,9 +1058,9 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: .LBB18_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB18_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -1168,9 +1168,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, 
<8 x i8 ; RV64ZVE32F-NEXT: .LBB19_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1191,9 +1191,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: .LBB19_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -1302,9 +1302,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: .LBB20_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB20_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1326,9 +1326,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: .LBB20_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -1440,9 +1440,9 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: .LBB21_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1463,9 +1463,9 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: .LBB21_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -1638,10 +1638,10 @@ define void @mscatter_v2i64_truncstore_v2i32(<2 x i64> %val, <2 x ptr> %ptrs, <2 ; RV64ZVE32F-NEXT: vmv.v.x v8, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 -; RV64ZVE32F-NEXT: andi a4, a0, 1 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 -; RV64ZVE32F-NEXT: bnez a4, .LBB24_3 +; RV64ZVE32F-NEXT: andi a1, a0, 1 +; RV64ZVE32F-NEXT: bnez a1, .LBB24_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB24_4 @@ -1915,9 
+1915,9 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB29_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -1939,9 +1939,9 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB29_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -2052,9 +2052,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB30_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2076,9 +2076,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB30_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -2193,9 +2193,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: .LBB31_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2218,9 +2218,9 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB31_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -2339,9 +2339,9 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: .LBB32_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB32_12 ; RV64ZVE32F-NEXT: # 
%bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2363,9 +2363,9 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB32_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -2478,9 +2478,9 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: .LBB33_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB33_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2502,9 +2502,9 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB33_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -2620,9 +2620,9 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: .LBB34_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB34_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2645,9 +2645,9 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB34_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB34_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -2761,9 +2761,9 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs, ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -2785,9 +2785,9 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB35_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; 
RV64ZVE32F-NEXT: bnez a2, .LBB35_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -3425,13 +3425,13 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV32ZVE32F-NEXT: lw t6, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi s2, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB42_10 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB42_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB42_11 @@ -3560,9 +3560,9 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: .LBB42_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB42_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -3580,8 +3580,8 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB42_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -3675,13 +3675,13 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: lw t6, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi s2, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB43_10 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB43_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB43_11 @@ -3810,9 +3810,9 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: .LBB43_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -3830,8 +3830,8 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB43_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -3927,13 +3927,13 @@ define void 
@mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV32ZVE32F-NEXT: lw t6, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vzext.vf4 v10, v8 -; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi s2, t0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB44_10 +; RV32ZVE32F-NEXT: andi a1, t0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB44_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB44_11 @@ -4032,7 +4032,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -4040,8 +4040,8 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi t2, a5, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB44_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -4051,7 +4051,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB44_2: # %else -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4064,18 +4064,18 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: .LBB44_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: andi a0, a5, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_13 ; RV64ZVE32F-NEXT: .LBB44_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_14 ; RV64ZVE32F-NEXT: .LBB44_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: andi a0, a5, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_9 ; RV64ZVE32F-NEXT: .LBB44_8: # %cond.store9 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -4083,13 +4083,13 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB44_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a4, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_16 ; RV64ZVE32F-NEXT: 
.LBB44_11: # %else14 ; RV64ZVE32F-NEXT: ret @@ -4099,7 +4099,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: andi a0, a5, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 ; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4108,7 +4108,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_7 ; RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -4116,7 +4116,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: andi a0, a5, 32 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_8 ; RV64ZVE32F-NEXT: j .LBB44_9 ; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11 @@ -4125,7 +4125,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB44_11 ; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4186,13 +4186,13 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV32ZVE32F-NEXT: lw t5, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li s1, 8 +; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, s1, v8 -; RV32ZVE32F-NEXT: bnez s2, .LBB45_10 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB45_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_11 @@ -4323,9 +4323,9 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: .LBB45_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -4343,8 +4343,8 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB45_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -4438,13 +4438,13 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: lw t5, 20(a0) ; 
RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li s1, 8 +; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, s1, v8 -; RV32ZVE32F-NEXT: bnez s2, .LBB46_10 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB46_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_11 @@ -4575,9 +4575,9 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: .LBB46_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -4595,8 +4595,8 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB46_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -4692,13 +4692,13 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: lw t5, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li s1, 8 +; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccu.vx v10, s1, v8 -; RV32ZVE32F-NEXT: bnez s2, .LBB47_10 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB47_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_11 @@ -4798,7 +4798,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a5, 40(a0) +; RV64ZVE32F-NEXT: ld a4, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -4806,8 +4806,8 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi t2, a5, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB47_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -4818,7 +4818,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB47_2: # %else -; RV64ZVE32F-NEXT: andi a0, a4, 2 +; RV64ZVE32F-NEXT: andi 
a0, a5, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma @@ -4831,18 +4831,18 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: .LBB47_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: andi a0, a5, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_13 ; RV64ZVE32F-NEXT: .LBB47_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_14 ; RV64ZVE32F-NEXT: .LBB47_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: andi a0, a5, 32 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_9 ; RV64ZVE32F-NEXT: .LBB47_8: # %cond.store9 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 @@ -4850,13 +4850,13 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: slli a0, a0, 48 ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB47_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a4, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 ; RV64ZVE32F-NEXT: .LBB47_11: # %else14 ; RV64ZVE32F-NEXT: ret @@ -4866,7 +4866,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: andi a0, a5, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4875,7 +4875,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_7 ; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 @@ -4883,7 +4883,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: andi a0, a5, 32 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_8 ; RV64ZVE32F-NEXT: j .LBB47_9 ; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11 @@ -4892,7 +4892,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: beqz a0, .LBB47_11 ; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4943,42 +4943,41 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: lw a7, 44(a0) ; RV32ZVE32F-NEXT: lw a4, 
48(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 28(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 36(a0) -; RV32ZVE32F-NEXT: lw s0, 8(a0) -; RV32ZVE32F-NEXT: lw s1, 12(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw t6, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi s2, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB48_10 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB48_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, t0, 2 +; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_11 ; RV32ZVE32F-NEXT: .LBB48_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, t0, 4 +; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_12 ; RV32ZVE32F-NEXT: .LBB48_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, t0, 8 +; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_13 ; RV32ZVE32F-NEXT: .LBB48_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, t0, 16 +; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_14 ; RV32ZVE32F-NEXT: .LBB48_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, t0, 32 +; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_15 ; RV32ZVE32F-NEXT: .LBB48_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, t0, 64 +; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_16 ; RV32ZVE32F-NEXT: .LBB48_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, t0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_9 ; RV32ZVE32F-NEXT: .LBB48_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -4998,44 +4997,45 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB48_10: # %cond.store ; RV32ZVE32F-NEXT: .cfi_restore_state -; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: lw s1, 0(a0) ; RV32ZVE32F-NEXT: lw a0, 4(a0) +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 -; RV32ZVE32F-NEXT: sw a1, 0(s2) +; RV32ZVE32F-NEXT: sw s1, 0(s2) ; RV32ZVE32F-NEXT: sw a0, 4(s2) -; RV32ZVE32F-NEXT: andi a0, t0, 2 +; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_2 ; RV32ZVE32F-NEXT: .LBB48_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw s0, 0(a0) -; RV32ZVE32F-NEXT: sw s1, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 4 +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_3 ; RV32ZVE32F-NEXT: .LBB48_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t6, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 8 +; RV32ZVE32F-NEXT: sw t4, 
0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_4 ; RV32ZVE32F-NEXT: .LBB48_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 16 +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_5 ; RV32ZVE32F-NEXT: .LBB48_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 32 +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_6 ; RV32ZVE32F-NEXT: .LBB48_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5043,7 +5043,7 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a6, 0(a0) ; RV32ZVE32F-NEXT: sw a7, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 64 +; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB48_7 ; RV32ZVE32F-NEXT: .LBB48_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5051,7 +5051,7 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_8 ; RV32ZVE32F-NEXT: j .LBB48_9 ; @@ -5088,9 +5088,9 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV64ZVE32F-NEXT: .LBB48_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -5108,8 +5108,8 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB48_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -5193,42 +5193,41 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: lw a7, 44(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 28(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 36(a0) -; RV32ZVE32F-NEXT: lw s0, 8(a0) -; RV32ZVE32F-NEXT: lw s1, 12(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw t6, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; 
RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi s2, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB49_10 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB49_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, t0, 2 +; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_11 ; RV32ZVE32F-NEXT: .LBB49_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, t0, 4 +; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_12 ; RV32ZVE32F-NEXT: .LBB49_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, t0, 8 +; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_13 ; RV32ZVE32F-NEXT: .LBB49_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, t0, 16 +; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_14 ; RV32ZVE32F-NEXT: .LBB49_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, t0, 32 +; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_15 ; RV32ZVE32F-NEXT: .LBB49_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, t0, 64 +; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_16 ; RV32ZVE32F-NEXT: .LBB49_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, t0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_9 ; RV32ZVE32F-NEXT: .LBB49_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5248,44 +5247,45 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB49_10: # %cond.store ; RV32ZVE32F-NEXT: .cfi_restore_state -; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: lw s1, 0(a0) ; RV32ZVE32F-NEXT: lw a0, 4(a0) +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 -; RV32ZVE32F-NEXT: sw a1, 0(s2) +; RV32ZVE32F-NEXT: sw s1, 0(s2) ; RV32ZVE32F-NEXT: sw a0, 4(s2) -; RV32ZVE32F-NEXT: andi a0, t0, 2 +; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_2 ; RV32ZVE32F-NEXT: .LBB49_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw s0, 0(a0) -; RV32ZVE32F-NEXT: sw s1, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 4 +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_3 ; RV32ZVE32F-NEXT: .LBB49_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t6, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 8 +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_4 ; RV32ZVE32F-NEXT: .LBB49_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 16 +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_5 ; RV32ZVE32F-NEXT: .LBB49_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; 
RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 32 +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_6 ; RV32ZVE32F-NEXT: .LBB49_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5293,7 +5293,7 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a6, 0(a0) ; RV32ZVE32F-NEXT: sw a7, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 64 +; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB49_7 ; RV32ZVE32F-NEXT: .LBB49_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5301,7 +5301,7 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_8 ; RV32ZVE32F-NEXT: j .LBB49_9 ; @@ -5338,9 +5338,9 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: .LBB49_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -5358,8 +5358,8 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB49_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -5444,42 +5444,41 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: lw a7, 44(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw t3, 24(a0) -; RV32ZVE32F-NEXT: lw t4, 28(a0) -; RV32ZVE32F-NEXT: lw t1, 32(a0) -; RV32ZVE32F-NEXT: lw t2, 36(a0) -; RV32ZVE32F-NEXT: lw s0, 8(a0) -; RV32ZVE32F-NEXT: lw s1, 12(a0) -; RV32ZVE32F-NEXT: lw t5, 16(a0) -; RV32ZVE32F-NEXT: lw t6, 20(a0) +; RV32ZVE32F-NEXT: lw t2, 24(a0) +; RV32ZVE32F-NEXT: lw t3, 28(a0) +; RV32ZVE32F-NEXT: lw t0, 32(a0) +; RV32ZVE32F-NEXT: lw t1, 36(a0) +; RV32ZVE32F-NEXT: lw t6, 8(a0) +; RV32ZVE32F-NEXT: lw s0, 12(a0) +; RV32ZVE32F-NEXT: lw t4, 16(a0) +; RV32ZVE32F-NEXT: lw t5, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi s2, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB50_10 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: andi s1, a1, 1 +; RV32ZVE32F-NEXT: bnez s1, .LBB50_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, t0, 2 +; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_11 ; RV32ZVE32F-NEXT: 
.LBB50_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, t0, 4 +; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_12 ; RV32ZVE32F-NEXT: .LBB50_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, t0, 8 +; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_13 ; RV32ZVE32F-NEXT: .LBB50_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, t0, 16 +; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_14 ; RV32ZVE32F-NEXT: .LBB50_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, t0, 32 +; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_15 ; RV32ZVE32F-NEXT: .LBB50_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, t0, 64 +; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_16 ; RV32ZVE32F-NEXT: .LBB50_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, t0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_9 ; RV32ZVE32F-NEXT: .LBB50_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5499,44 +5498,45 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB50_10: # %cond.store ; RV32ZVE32F-NEXT: .cfi_restore_state -; RV32ZVE32F-NEXT: lw a1, 0(a0) +; RV32ZVE32F-NEXT: lw s1, 0(a0) ; RV32ZVE32F-NEXT: lw a0, 4(a0) +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 -; RV32ZVE32F-NEXT: sw a1, 0(s2) +; RV32ZVE32F-NEXT: sw s1, 0(s2) ; RV32ZVE32F-NEXT: sw a0, 4(s2) -; RV32ZVE32F-NEXT: andi a0, t0, 2 +; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_2 ; RV32ZVE32F-NEXT: .LBB50_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw s0, 0(a0) -; RV32ZVE32F-NEXT: sw s1, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 4 +; RV32ZVE32F-NEXT: sw t6, 0(a0) +; RV32ZVE32F-NEXT: sw s0, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_3 ; RV32ZVE32F-NEXT: .LBB50_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t5, 0(a0) -; RV32ZVE32F-NEXT: sw t6, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 8 +; RV32ZVE32F-NEXT: sw t4, 0(a0) +; RV32ZVE32F-NEXT: sw t5, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_4 ; RV32ZVE32F-NEXT: .LBB50_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t3, 0(a0) -; RV32ZVE32F-NEXT: sw t4, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 16 +; RV32ZVE32F-NEXT: sw t2, 0(a0) +; RV32ZVE32F-NEXT: sw t3, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_5 ; RV32ZVE32F-NEXT: .LBB50_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 -; RV32ZVE32F-NEXT: sw t1, 0(a0) -; RV32ZVE32F-NEXT: sw t2, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, 32 +; RV32ZVE32F-NEXT: sw t0, 0(a0) +; RV32ZVE32F-NEXT: sw t1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_6 ; RV32ZVE32F-NEXT: .LBB50_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5544,7 +5544,7 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a6, 0(a0) ; RV32ZVE32F-NEXT: sw a7, 4(a0) -; 
RV32ZVE32F-NEXT: andi a0, t0, 64 +; RV32ZVE32F-NEXT: andi a0, a1, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB50_7 ; RV32ZVE32F-NEXT: .LBB50_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma @@ -5552,7 +5552,7 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, t0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_8 ; RV32ZVE32F-NEXT: j .LBB50_9 ; @@ -5591,9 +5591,9 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: .LBB50_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB50_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 @@ -5612,8 +5612,8 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: .LBB50_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 64 ; RV64ZVE32F-NEXT: bnez a0, .LBB50_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 @@ -5745,9 +5745,9 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: andi s2, a2, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: bnez s2, .LBB51_10 +; RV32ZVE32F-NEXT: andi a1, a2, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a2, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB51_11 @@ -5928,8 +5928,8 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB51_10: # %cond.store ; RV64ZVE32F-NEXT: .cfi_restore_state -; RV64ZVE32F-NEXT: ld a2, 0(a2) ; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a2) ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a1, a2 ; RV64ZVE32F-NEXT: sd a0, 0(a2) @@ -6350,9 +6350,9 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: .LBB58_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB58_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6375,9 +6375,9 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB58_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB58_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6501,9 +6501,9 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; 
RV64ZVE32F-NEXT: .LBB59_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6526,9 +6526,9 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB59_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6653,9 +6653,9 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB60_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6679,9 +6679,9 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB60_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -6809,9 +6809,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: .LBB61_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB61_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -6834,9 +6834,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB61_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB61_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -7453,9 +7453,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFH-NEXT: .LBB68_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_12 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -7476,9 +7476,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> 
%val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB68_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_15 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 @@ -7567,9 +7567,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_12 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -7592,9 +7592,9 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_15 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -7714,9 +7714,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: .LBB69_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_12 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -7737,9 +7737,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB69_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_15 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 @@ -7828,9 +7828,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_12 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -7853,9 +7853,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; 
RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_15 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -7976,9 +7976,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: .LBB70_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_12 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -8000,9 +8000,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB70_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_15 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 @@ -8098,9 +8098,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_12 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -8124,9 +8124,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_15 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -8250,9 +8250,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFH-NEXT: .LBB71_4: # %else2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_12 ; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 @@ -8273,9 +8273,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB71_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, 
e16, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_15 ; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 @@ -8363,9 +8363,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_4: # %else2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_12 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 @@ -8388,9 +8388,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_15 ; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 @@ -8795,9 +8795,9 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> % ; RV64ZVE32F-NEXT: .LBB78_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8819,9 +8819,9 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> % ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB78_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -8932,9 +8932,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: .LBB79_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB79_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -8956,9 +8956,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB79_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB79_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -9073,9 +9073,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, 
ptr %base, <8 x ; RV64ZVE32F-NEXT: .LBB80_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB80_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9098,9 +9098,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB80_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB80_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -9219,9 +9219,9 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> ; RV64ZVE32F-NEXT: .LBB81_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB81_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9243,9 +9243,9 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB81_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB81_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -9358,9 +9358,9 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: .LBB82_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9382,9 +9382,9 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB82_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -9500,9 +9500,9 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: .LBB83_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB83_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9525,9 +9525,9 @@ 
define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB83_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -9641,9 +9641,9 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: .LBB84_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -9665,9 +9665,9 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB84_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -10179,9 +10179,9 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB91_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB91_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB91_10 @@ -10283,9 +10283,9 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: .LBB91_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10303,8 +10303,8 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB91_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -10379,9 +10379,9 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB92_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB92_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB92_10 @@ -10483,9 +10483,9 @@ 
define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: .LBB92_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB92_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10503,8 +10503,8 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB92_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB92_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -10581,9 +10581,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB93_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB93_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB93_10 @@ -10687,9 +10687,9 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: .LBB93_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB93_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10708,8 +10708,8 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB93_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB93_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -10789,10 +10789,10 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 ; RV32ZVE32F-NEXT: li a1, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a2, a0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 -; RV32ZVE32F-NEXT: bnez a2, .LBB94_9 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB94_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB94_10 @@ -10896,9 +10896,9 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 ; RV64ZVE32F-NEXT: .LBB94_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB94_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -10916,8 +10916,8 @@ define void 
@mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB94_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB94_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -10991,10 +10991,10 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F-NEXT: li a1, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a2, a0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 -; RV32ZVE32F-NEXT: bnez a2, .LBB95_9 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB95_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB95_10 @@ -11098,9 +11098,9 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB95_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB95_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -11118,8 +11118,8 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB95_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB95_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -11195,10 +11195,10 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F-NEXT: li a1, 8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a2, a0, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8 -; RV32ZVE32F-NEXT: bnez a2, .LBB96_9 +; RV32ZVE32F-NEXT: andi a1, a0, 1 +; RV32ZVE32F-NEXT: bnez a1, .LBB96_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB96_10 @@ -11304,9 +11304,9 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB96_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB96_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -11325,8 +11325,8 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB96_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB96_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -11405,10 +11405,10 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr 
%base, <8 x i32 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB97_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB97_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB97_10 @@ -11511,9 +11511,9 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32 ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -11531,8 +11531,8 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB97_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -11605,10 +11605,10 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB98_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB98_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB98_10 @@ -11711,9 +11711,9 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -11731,8 +11731,8 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB98_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -11806,10 +11806,10 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB99_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB99_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, 
.LBB99_10 @@ -11914,9 +11914,9 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: .LBB99_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB99_12 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -11935,8 +11935,8 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB99_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB99_15 ; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 @@ -12031,9 +12031,9 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 3 -; RV32ZVE32F-NEXT: andi a2, a1, 1 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_9 +; RV32ZVE32F-NEXT: andi a0, a1, 1 +; RV32ZVE32F-NEXT: bnez a0, .LBB100_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB100_10 @@ -12244,9 +12244,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB101_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB101_25 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -12261,9 +12261,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB101_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB101_10 ; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -12274,9 +12274,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB101_10: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB101_27 ; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a2, a1, 128 @@ -12298,9 +12298,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB101_15: # %else18 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: bnez a2, .LBB101_30 ; RV64ZVE32F-NEXT: # %bb.16: # %else20 ; RV64ZVE32F-NEXT: 
slli a2, a1, 52 @@ -12320,9 +12320,9 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 13 ; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: .LBB101_20: # %else26 -; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: bgez a2, .LBB101_22 ; RV64ZVE32F-NEXT: # %bb.21: # %cond.store27 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -12443,11 +12443,11 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64V-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64V-NEXT: vslidedown.vi v8, v8, 16 ; RV64V-NEXT: vslidedown.vi v10, v10, 16 -; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64V-NEXT: vslidedown.vi v0, v0, 2 ; RV64V-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64V-NEXT: vsext.vf8 v16, v10 -; RV64V-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; RV64V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64V-NEXT: vslidedown.vi v0, v0, 2 +; RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64V-NEXT: ret ; @@ -12476,9 +12476,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB102_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB102_49 ; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 @@ -12493,9 +12493,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB102_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 ; RV64ZVE32F-NEXT: beqz a2, .LBB102_10 ; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -12506,9 +12506,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB102_10: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: bnez a2, .LBB102_51 ; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a2, a1, 128 @@ -12530,9 +12530,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB102_15: # %else18 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: beqz a2, .LBB102_17 ; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 @@ -12552,9 +12552,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 11 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB102_19: # %else22 -; RV64ZVE32F-NEXT: slli a2, a1, 51 ; RV64ZVE32F-NEXT: 
vsetivli zero, 16, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16 +; RV64ZVE32F-NEXT: slli a2, a1, 51 ; RV64ZVE32F-NEXT: bgez a2, .LBB102_21 ; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 @@ -12574,9 +12574,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 13 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB102_23: # %else26 -; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: bltz a2, .LBB102_54 ; RV64ZVE32F-NEXT: # %bb.24: # %else28 ; RV64ZVE32F-NEXT: slli a2, a1, 48 @@ -12599,9 +12599,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB102_28: # %else34 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: bltz a2, .LBB102_57 ; RV64ZVE32F-NEXT: # %bb.29: # %else36 ; RV64ZVE32F-NEXT: slli a2, a1, 44 @@ -12617,9 +12617,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB102_32: # %else40 -; RV64ZVE32F-NEXT: slli a2, a1, 42 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 +; RV64ZVE32F-NEXT: slli a2, a1, 42 ; RV64ZVE32F-NEXT: bgez a2, .LBB102_34 ; RV64ZVE32F-NEXT: # %bb.33: # %cond.store41 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -12631,9 +12631,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB102_34: # %else42 -; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: bltz a2, .LBB102_59 ; RV64ZVE32F-NEXT: # %bb.35: # %else44 ; RV64ZVE32F-NEXT: slli a2, a1, 40 @@ -12656,9 +12656,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: .LBB102_39: # %else50 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 -; RV64ZVE32F-NEXT: slli a2, a1, 37 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 37 ; RV64ZVE32F-NEXT: bltz a2, .LBB102_62 ; RV64ZVE32F-NEXT: # %bb.40: # %else52 ; RV64ZVE32F-NEXT: slli a2, a1, 36 @@ -12679,9 +12679,9 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB102_44: # %else58 -; RV64ZVE32F-NEXT: slli a2, a1, 33 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: slli a2, a1, 33 ; RV64ZVE32F-NEXT: bgez a2, .LBB102_46 ; RV64ZVE32F-NEXT: # %bb.45: # %cond.store59 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll index ed6ec4d5659b1..6421d7c8022f4 100644 
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll @@ -242,9 +242,9 @@ define void @masked_store_v32f64(<32 x double> %val, ptr %a, <32 x i1> %mask) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v16, (a0), v0.t ; CHECK-NEXT: ret @@ -278,12 +278,12 @@ define void @masked_store_v64f32(<64 x float> %val, ptr %a, <64 x i1> %mask) { ; CHECK-LABEL: masked_store_v64f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vse32.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v64f32.p0(<64 x float> %val, ptr %a, i32 8, <64 x i1> %mask) @@ -294,12 +294,12 @@ define void @masked_store_v128bf16(<128 x bfloat> %val, ptr %a, <128 x i1> %mask ; CHECK-LABEL: masked_store_v128bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v128bf16.p0(<128 x bfloat> %val, ptr %a, i32 8, <128 x i1> %mask) @@ -310,12 +310,12 @@ define void @masked_store_v128f16(<128 x half> %val, ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_store_v128f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v128f16.p0(<128 x half> %val, ptr %a, i32 8, <128 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index c3b10db115bae..7a9fc0ecd8bb0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -240,9 +240,9 @@ define void @masked_store_v32i64(<32 x i64> %val, ptr %a, <32 x i1> %mask) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v16, (a0), v0.t ; CHECK-NEXT: ret @@ -276,12 +276,12 @@ define void @masked_store_v64i32(<64 x i32> %val, ptr %a, <64 
x i1> %mask) { ; CHECK-LABEL: masked_store_v64i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 4 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vse32.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v64i32.p0(<64 x i32> %val, ptr %a, i32 8, <64 x i1> %mask) @@ -303,12 +303,12 @@ define void @masked_store_v128i16(<128 x i16> %val, ptr %a, <128 x i1> %mask) { ; CHECK-LABEL: masked_store_v128i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 8 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vse16.v v16, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.masked.store.v128i16.p0(<128 x i16> %val, ptr %a, i32 8, <128 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index 80a9143d1ad8b..68e218fcad062 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -9,19 +9,19 @@ declare <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI0_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -30,17 +30,17 @@ define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % define <2 x half> @vp_nearbyint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v 
@@ -51,19 +51,19 @@ declare <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI2_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -72,17 +72,17 @@ define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % define <4 x half> @vp_nearbyint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI3_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -93,19 +93,19 @@ declare <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI4_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -114,17 +114,17 @@ define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % define <8 x half> @vp_nearbyint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI5_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: 
vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -137,19 +137,19 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI6_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI6_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -158,17 +158,17 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe define <16 x half> @vp_nearbyint_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI7_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v @@ -183,15 +183,15 @@ define <2 x float> @vp_nearbyint_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> %m, i32 %evl) ret <2 x float> %v @@ -204,13 +204,13 @@ define <2 x float> @vp_nearbyint_v2f32_unmasked(<2 x float> %va, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x 
i1> splat (i1 true), i32 %evl) ret <2 x float> %v @@ -225,15 +225,15 @@ define <4 x float> @vp_nearbyint_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl) ret <4 x float> %v @@ -246,13 +246,13 @@ define <4 x float> @vp_nearbyint_v4f32_unmasked(<4 x float> %va, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x float> %v @@ -268,16 +268,16 @@ define <8 x float> @vp_nearbyint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> %m, i32 %evl) ret <8 x float> %v @@ -290,13 +290,13 @@ define <8 x float> @vp_nearbyint_v8f32_unmasked(<8 x float> %va, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x float> %v @@ -312,16 +312,16 @@ define <16 x float> @vp_nearbyint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x float> 
@llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl) ret <16 x float> %v @@ -334,13 +334,13 @@ define <16 x float> @vp_nearbyint_v16f32_unmasked(<16 x float> %va, i32 zeroext ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x float> %v @@ -351,19 +351,19 @@ declare <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI16_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v @@ -372,17 +372,17 @@ define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v @@ -395,19 +395,19 @@ define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 
x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v @@ -416,17 +416,17 @@ define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v @@ -439,19 +439,19 @@ define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v @@ -460,17 +460,17 @@ define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v @@ -483,19 +483,19 @@ define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 z ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t 
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v @@ -504,17 +504,17 @@ define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 z define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v15f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v @@ -527,19 +527,19 @@ define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 z ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v @@ -548,17 +548,17 @@ define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 z define <16 x double> @vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v @@ -569,59 +569,69 @@ declare <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v32f64: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, 
a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: lui a2, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: lui a1, %hi(.LCPI26_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; CHECK-NEXT: addi a1, a0, -16 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: frflags a1 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: frflags a1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: 
addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -664,9 +674,9 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index bf8baafc4a25d..ff6984eb82df1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -122,9 +122,9 @@ define i32 @reduce_sum_16xi32_prefix3(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix3: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -140,9 +140,9 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix4: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredsum.vs v8, v8, v9 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -160,9 +160,9 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix5: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredsum.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -182,9 +182,9 @@ define i32 @reduce_sum_16xi32_prefix6(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix6: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredsum.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -206,9 +206,9 @@ define i32 @reduce_sum_16xi32_prefix7(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix7: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredsum.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -232,9 +232,9 @@ define i32 @reduce_sum_16xi32_prefix8(ptr %p) { ; CHECK-LABEL: 
reduce_sum_16xi32_prefix8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredsum.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -492,9 +492,9 @@ define i32 @reduce_xor_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_xor_16xi32_prefix5: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredxor.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredxor.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -568,9 +568,9 @@ define i32 @reduce_or_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_or_16xi32_prefix5: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredor.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredor.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -609,11 +609,11 @@ define i32 @reduce_smax_16xi32_prefix2(ptr %p) { define i32 @reduce_smax_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_smax_16xi32_prefix5: ; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 524288 ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vredmax.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredmax.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -687,9 +687,9 @@ define i32 @reduce_umax_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_umax_16xi32_prefix5: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vredmaxu.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vredmaxu.vs v8, v10, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -734,11 +734,11 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) { ; ; RV64-LABEL: reduce_umin_16xi32_prefix5: ; RV64: # %bb.0: +; RV64-NEXT: li a1, -1 ; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vredminu.vs v8, v8, v10 +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vle32.v v10, (a0) +; RV64-NEXT: vredminu.vs v8, v10, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -758,9 +758,9 @@ define float @reduce_fadd_16xf32_prefix2(ptr %p) { ; CHECK-LABEL: reduce_fadd_16xf32_prefix2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfredusum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %p, align 256 @@ -773,11 +773,11 @@ define float @reduce_fadd_16xf32_prefix2(ptr %p) { define float @reduce_fadd_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_fadd_16xi32_prefix5: ; CHECK: # %bb.0: +; 
CHECK-NEXT: lui a1, 524288 ; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vfredusum.vs v8, v10, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %p, align 256 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll index 6684e6d223eac..c2cac3eeb7a46 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll @@ -121,9 +121,9 @@ declare float @llvm.vp.reduce.fadd.v64f32(float, <64 x float>, <64 x i1>, i32) define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_fadd_v64f32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB8_2 ; CHECK-NEXT: # %bb.1: @@ -149,9 +149,9 @@ define float @vpreduce_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 define float @vpreduce_ord_fadd_v64f32(float %s, <64 x float> %v, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_ord_fadd_v64f32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB9_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 2b279389253b0..23197ede1da49 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -253,11 +253,11 @@ define half @vreduce_ord_fadd_v128f16(ptr %x, half %s) { ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vfmv.s.f v24, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v24 -; CHECK-NEXT: vfredosum.vs v8, v16, v8 +; CHECK-NEXT: vfredosum.vs v16, v16, v24 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <128 x half>, ptr %x @@ -744,11 +744,11 @@ define float @vreduce_ord_fadd_v64f32(ptr %x, float %s) { ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vfmv.s.f v24, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v24 -; CHECK-NEXT: vfredosum.vs v8, v16, v8 +; CHECK-NEXT: vfredosum.vs v16, v16, v24 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x float>, ptr %x @@ -1135,11 +1135,11 @@ define double @vreduce_ord_fadd_v32f64(ptr %x, double %s) { ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: vfmv.s.f v24, fa0 -; CHECK-NEXT: vfredosum.vs v8, v8, v24 -; CHECK-NEXT: vfredosum.vs v8, 
v16, v8 +; CHECK-NEXT: vfredosum.vs v16, v16, v24 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x double>, ptr %x @@ -1344,17 +1344,17 @@ define float @vreduce_fmin_v128f32(ptr %x) { ; CHECK-LABEL: vreduce_fmin_v128f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmin.vv v8, v24, v8 -; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmin.vv v24, v0, v24 ; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1591,17 +1591,17 @@ define float @vreduce_fmax_v128f32(ptr %x) { ; CHECK-LABEL: vreduce_fmax_v128f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmax.vv v8, v24, v8 -; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmax.vv v24, v0, v24 ; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -1997,59 +1997,56 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; 
CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v24, v24, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 ; CHECK-NEXT: beqz a0, .LBB121_2 @@ -2077,17 +2074,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v128f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmin.vv v8, v24, v8 -; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmin.vv v24, v0, v24 ; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2245,42 +2242,25 @@ declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>) define double @vreduce_fminimum_v32f64(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: 
vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 ; CHECK-NEXT: beqz a0, .LBB131_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, %hi(.LCPI131_0) ; CHECK-NEXT: fld fa0, %lo(.LCPI131_0)(a0) -; CHECK-NEXT: j .LBB131_3 +; CHECK-NEXT: ret ; CHECK-NEXT: .LBB131_2: ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB131_3: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = load <32 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v) @@ -2314,59 +2294,56 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, 
v24, v8, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v24, v24, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 ; CHECK-NEXT: beqz a0, .LBB133_2 @@ -2395,15 +2372,15 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: vle64.v v0, (a1) -; CHECK-NEXT: vfmin.vv v16, v24, v16 -; CHECK-NEXT: vfmin.vv v8, v8, v0 +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vfmin.vv v24, v0, v24 ; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfmin.vv v8, v8, v24 ; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2711,59 +2688,56 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: addi a2, a0, 384 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vle32.v v16, (a2) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vle32.v 
v24, (a1) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmfeq.vv v0, v8, v8 ; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v24, v24, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 ; CHECK-NEXT: beqz a0, .LBB149_2 @@ -2791,17 +2765,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a2) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vfmax.vv v8, v24, v8 -; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmax.vv v24, v0, v24 ; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -2959,42 +2933,25 @@ declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>) define double @vreduce_fmaximum_v32f64(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 ; CHECK-NEXT: 
vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 ; CHECK-NEXT: beqz a0, .LBB159_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, %hi(.LCPI159_0) ; CHECK-NEXT: fld fa0, %lo(.LCPI159_0)(a0) -; CHECK-NEXT: j .LBB159_3 +; CHECK-NEXT: ret ; CHECK-NEXT: .LBB159_2: ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB159_3: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = load <32 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v) @@ -3028,59 +2985,56 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 384 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmfeq.vv v0, v24, v24 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 +; CHECK-NEXT: vmerge.vvm v16, v24, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size 
Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v24, v24, v8 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 ; CHECK-NEXT: beqz a0, .LBB161_2 @@ -3109,15 +3063,15 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: vle64.v v0, (a1) -; CHECK-NEXT: vfmax.vv v16, v24, v16 -; CHECK-NEXT: vfmax.vv v8, v8, v0 +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: vle64.v v0, (a0) +; CHECK-NEXT: vfmax.vv v24, v0, v24 ; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfmax.vv v8, v8, v24 ; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll index f920e39e7d295..8f61f314cf71b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -801,9 +801,9 @@ declare i32 @llvm.vp.reduce.xor.v64i32(i32, <64 x i32>, <64 x i1>, i32) define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vpreduce_xor_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bltu a1, a3, .LBB49_2 ; CHECK-NEXT: # %bb.1: @@ -1575,10 +1575,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vid.v v10 ; RV32-NEXT: vmsltu.vx v9, v10, a1 -; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 1 -; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 +; RV32-NEXT: vmv.v.i v10, 1 +; RV32-NEXT: vmand.mm v0, v9, v0 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: vslidedown.vi v9, v8, 4 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: vslidedown.vi v9, v8, 2 @@ -1606,10 +1606,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vid.v v10 ; RV64-NEXT: vmsltu.vx v9, v10, a1 -; RV64-NEXT: vmand.mm v0, 
v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.i v9, 1 -; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 +; RV64-NEXT: vmv.v.i v10, 1 +; RV64-NEXT: vmand.mm v0, v9, v0 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: vslidedown.vi v9, v8, 4 ; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: vslidedown.vi v9, v8, 2 @@ -1643,10 +1643,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vid.v v12 ; RV32-NEXT: vmsltu.vx v9, v12, a1 -; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, 1 -; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 +; RV32-NEXT: vmv.v.i v10, 1 +; RV32-NEXT: vmand.mm v0, v9, v0 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: vslidedown.vi v9, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: vslidedown.vi v9, v8, 4 @@ -1676,10 +1676,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vid.v v12 ; RV64-NEXT: vmsltu.vx v9, v12, a1 -; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; RV64-NEXT: vmv.v.i v9, 1 -; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 +; RV64-NEXT: vmv.v.i v10, 1 +; RV64-NEXT: vmand.mm v0, v9, v0 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: vslidedown.vi v9, v8, 8 ; RV64-NEXT: vmul.vv v8, v8, v9 ; RV64-NEXT: vslidedown.vi v9, v8, 4 @@ -1716,10 +1716,10 @@ define signext i8 @vpreduce_mul_v32i8(i8 signext %s, <32 x i8> %v, <32 x i1> %m, ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV32-NEXT: vid.v v16 ; RV32-NEXT: vmsltu.vx v10, v16, a1 -; RV32-NEXT: vmand.mm v0, v10, v0 ; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; RV32-NEXT: vmv.v.i v10, 1 -; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV32-NEXT: vmv.v.i v12, 1 +; RV32-NEXT: vmand.mm v0, v10, v0 +; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV32-NEXT: vslidedown.vi v10, v8, 16 ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 8 @@ -1752,10 +1752,10 @@ define signext i8 @vpreduce_mul_v32i8(i8 signext %s, <32 x i8> %v, <32 x i1> %m, ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; RV64-NEXT: vid.v v16 ; RV64-NEXT: vmsltu.vx v10, v16, a1 -; RV64-NEXT: vmand.mm v0, v10, v0 ; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; RV64-NEXT: vmv.v.i v10, 1 -; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV64-NEXT: vmv.v.i v12, 1 +; RV64-NEXT: vmand.mm v0, v10, v0 +; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV64-NEXT: vslidedown.vi v10, v8, 16 ; RV64-NEXT: vmul.vv v8, v8, v10 ; RV64-NEXT: vslidedown.vi v10, v8, 8 @@ -1794,18 +1794,19 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV32-NEXT: lui a3, %hi(.LCPI72_0) ; RV32-NEXT: addi a3, a3, %lo(.LCPI72_0) ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vle8.v v12, (a3) ; RV32-NEXT: vid.v v16 -; RV32-NEXT: vmsltu.vx v14, v16, a1 -; RV32-NEXT: li a3, 64 -; RV32-NEXT: vsext.vf4 v16, v12 ; RV32-NEXT: vmsltu.vx v12, v16, a1 +; RV32-NEXT: vle8.v v14, (a3) +; RV32-NEXT: li a3, 64 +; RV32-NEXT: vsext.vf4 v16, v14 +; RV32-NEXT: vmsltu.vx v13, v16, a1 +; RV32-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV32-NEXT: vmv.v.i v16, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslideup.vi v14, v12, 4 +; RV32-NEXT: vslideup.vi v12, v13, 4 ; RV32-NEXT: vsetvli zero, a3, e8, m4, ta, ma -; RV32-NEXT: vmand.mm v0, v14, v0 -; RV32-NEXT: vmv.v.i v12, 1 -; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 +; RV32-NEXT: vmand.mm v0, v12, 
v0 +; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV32-NEXT: vslidedown.vx v12, v8, a0 ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 16 @@ -1840,18 +1841,19 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV64-NEXT: lui a3, %hi(.LCPI72_0) ; RV64-NEXT: addi a3, a3, %lo(.LCPI72_0) ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vle8.v v12, (a3) ; RV64-NEXT: vid.v v16 -; RV64-NEXT: vmsltu.vx v14, v16, a1 -; RV64-NEXT: li a3, 64 -; RV64-NEXT: vsext.vf4 v16, v12 ; RV64-NEXT: vmsltu.vx v12, v16, a1 +; RV64-NEXT: vle8.v v14, (a3) +; RV64-NEXT: li a3, 64 +; RV64-NEXT: vsext.vf4 v16, v14 +; RV64-NEXT: vmsltu.vx v13, v16, a1 +; RV64-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV64-NEXT: vmv.v.i v16, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslideup.vi v14, v12, 4 +; RV64-NEXT: vslideup.vi v12, v13, 4 ; RV64-NEXT: vsetvli zero, a3, e8, m4, ta, ma -; RV64-NEXT: vmand.mm v0, v14, v0 -; RV64-NEXT: vmv.v.i v12, 1 -; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 +; RV64-NEXT: vmand.mm v0, v12, v0 +; RV64-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV64-NEXT: vslidedown.vx v12, v8, a0 ; RV64-NEXT: vmul.vv v8, v8, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 707d1202aca0f..c3c657c96c92a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -1471,14 +1471,14 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>) define i64 @vreduce_add_v64i64(ptr %x) nounwind { ; RV32-LABEL: vreduce_add_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a1) -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v0, (a1) ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 256 -; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vadd.vv v24, v0, v24 ; RV32-NEXT: vmv.s.x v7, zero ; RV32-NEXT: li a1, 32 @@ -1495,15 +1495,15 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vadd.vv v16, v24, v16 -; RV64-NEXT: vadd.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vadd.vv v24, v0, v24 ; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vadd.vv v8, v8, v24 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 @@ -1519,18 +1519,18 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a0) -; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v24, (a0) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v0, v16, 16 +; RV32-NEXT: vslidedown.vi v0, v24, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v24, v16, v8 +; RV32-NEXT: vwadd.vv v8, v24, v16 ; 
RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v16, v0, v8 +; RV32-NEXT: vwadd.vv v24, v0, v16 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v16 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 @@ -1550,15 +1550,15 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vle32.v v16, (a0) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vslidedown.vi v0, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwadd.vv v24, v8, v16 +; RV64-NEXT: vwadd.vv v24, v16, v8 ; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vwadd.vv v8, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma @@ -1585,18 +1585,18 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a0) -; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v24, (a0) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v0, v16, 16 +; RV32-NEXT: vslidedown.vi v0, v24, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v24, v16, v8 +; RV32-NEXT: vwaddu.vv v8, v24, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v16, v0, v8 +; RV32-NEXT: vwaddu.vv v24, v0, v16 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vadd.vv v8, v24, v16 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 @@ -1616,15 +1616,15 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vle32.v v8, (a1) +; RV64-NEXT: vle32.v v16, (a0) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: addi a0, sp, 16 ; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vslidedown.vi v0, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwaddu.vv v24, v8, v16 +; RV64-NEXT: vwaddu.vv v24, v16, v8 ; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV64-NEXT: vwaddu.vv v8, v16, v0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma @@ -2201,16 +2201,16 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; 
RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v0, v24 ; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vredand.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2222,15 +2222,15 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vand.vv v16, v24, v16 -; RV64-NEXT: vand.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vand.vv v24, v0, v24 ; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: vredand.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -2793,16 +2793,16 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v24, v0, v24 ; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vredor.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -2814,15 +2814,15 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vor.vv v16, v24, v16 -; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vor.vv v24, v0, v24 ; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: vredor.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -3414,14 +3414,14 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>) define i64 @vreduce_xor_v64i64(ptr %x) nounwind { ; RV32-LABEL: vreduce_xor_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a1) -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v0, (a1) ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 256 -; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vxor.vv v24, v0, v24 ; RV32-NEXT: vmv.s.x v7, zero ; RV32-NEXT: li a1, 32 @@ -3438,15 +3438,15 @@ define i64 @vreduce_xor_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; 
RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vxor.vv v16, v24, v16 -; RV64-NEXT: vxor.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vxor.vv v24, v0, v24 ; RV64-NEXT: vxor.vv v8, v8, v16 +; RV64-NEXT: vxor.vv v8, v8, v24 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredxor.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 @@ -4011,16 +4011,16 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vmin.vv v16, v0, v16 -; RV32-NEXT: vmin.vv v8, v8, v24 +; RV32-NEXT: vmin.vv v24, v0, v24 ; RV32-NEXT: vmin.vv v8, v8, v16 +; RV32-NEXT: vmin.vv v8, v8, v24 ; RV32-NEXT: vredmin.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -4032,15 +4032,15 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmin.vv v16, v24, v16 -; RV64-NEXT: vmin.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmin.vv v24, v0, v24 ; RV64-NEXT: vmin.vv v8, v8, v16 +; RV64-NEXT: vmin.vv v8, v8, v24 ; RV64-NEXT: vredmin.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -4604,16 +4604,16 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vmax.vv v16, v0, v16 -; RV32-NEXT: vmax.vv v8, v8, v24 +; RV32-NEXT: vmax.vv v24, v0, v24 ; RV32-NEXT: vmax.vv v8, v8, v16 +; RV32-NEXT: vmax.vv v8, v8, v24 ; RV32-NEXT: vredmax.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -4625,15 +4625,15 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmax.vv v16, v24, v16 -; RV64-NEXT: vmax.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmax.vv v24, v0, v24 ; RV64-NEXT: vmax.vv v8, v8, v16 +; RV64-NEXT: vmax.vv v8, v8, v24 ; RV64-NEXT: vredmax.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -5197,16 +5197,16 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; 
RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vminu.vv v16, v0, v16 -; RV32-NEXT: vminu.vv v8, v8, v24 +; RV32-NEXT: vminu.vv v24, v0, v24 ; RV32-NEXT: vminu.vv v8, v8, v16 +; RV32-NEXT: vminu.vv v8, v8, v24 ; RV32-NEXT: vredminu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5218,15 +5218,15 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vminu.vv v16, v24, v16 -; RV64-NEXT: vminu.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vminu.vv v24, v0, v24 ; RV64-NEXT: vminu.vv v8, v8, v16 +; RV64-NEXT: vminu.vv v8, v8, v24 ; RV64-NEXT: vredminu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -5789,16 +5789,16 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vmaxu.vv v16, v0, v16 -; RV32-NEXT: vmaxu.vv v8, v8, v24 +; RV32-NEXT: vmaxu.vv v24, v0, v24 ; RV32-NEXT: vmaxu.vv v8, v8, v16 +; RV32-NEXT: vmaxu.vv v8, v8, v24 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma @@ -5810,15 +5810,15 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmaxu.vv v16, v24, v16 -; RV64-NEXT: vmaxu.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmaxu.vv v24, v0, v24 ; RV64-NEXT: vmaxu.vv v8, v8, v16 +; RV64-NEXT: vmaxu.vv v8, v8, v24 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret @@ -6585,15 +6585,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a1, a0, 384 -; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 384 ; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: vle64.v v0, (a1) -; RV32-NEXT: vmul.vv v16, v24, v16 -; RV32-NEXT: vmul.vv v8, v8, v0 +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vle64.v v0, (a0) +; RV32-NEXT: vmul.vv v24, v0, v24 ; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 
; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 4 @@ -6612,15 +6612,15 @@ define i64 @vreduce_mul_v64i64(ptr %x) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a1, a0, 384 -; RV64-NEXT: vle64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 384 ; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: vle64.v v0, (a1) -; RV64-NEXT: vmul.vv v16, v24, v16 -; RV64-NEXT: vmul.vv v8, v8, v0 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vle64.v v0, (a0) +; RV64-NEXT: vmul.vv v24, v0, v24 ; RV64-NEXT: vmul.vv v8, v8, v16 +; RV64-NEXT: vmul.vv v8, v8, v24 ; RV64-NEXT: vslidedown.vi v16, v8, 8 ; RV64-NEXT: vmul.vv v8, v8, v16 ; RV64-NEXT: vslidedown.vi v16, v8, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 266772d36ee9c..70555bd6c09e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -519,8 +519,8 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -542,11 +542,11 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index a4ff079846fd8..d35637401dd66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.round.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -35,12 +35,12 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -59,12 +59,12 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -77,11 +77,11 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.round.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -121,12 +121,12 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -145,12 +145,12 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,11 +163,11 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 
x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.round.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -207,12 +207,12 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -231,12 +231,12 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -249,11 +249,11 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -273,12 +273,12 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) -; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t -; ZVFH-NEXT: 
fsrmi a0, 4 ; ZVFH-NEXT: vmv1r.v v0, v10 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -295,12 +295,12 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -319,12 +319,12 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -337,11 +337,11 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -363,9 +363,9 @@ define <2 x float> @vp_round_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -384,8 +384,8 @@ define <2 x float> @vp_round_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -405,9 +405,9 @@ define <4 x float> @vp_round_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -426,8 +426,8 @@ define <4 x float> @vp_round_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 
4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -448,9 +448,9 @@ define <8 x float> @vp_round_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -470,8 +470,8 @@ define <8 x float> @vp_round_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -492,9 +492,9 @@ define <16 x float> @vp_round_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -514,8 +514,8 @@ define <16 x float> @vp_round_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.round.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_round_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI16_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -552,12 +552,12 @@ define <2 x double> @vp_round_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext % define <2 x double> @vp_round_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -575,12 +575,12 @@ define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI18_0) ; CHECK-NEXT: fld 
fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -596,12 +596,12 @@ define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext % define <4 x double> @vp_round_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -619,12 +619,12 @@ define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -640,12 +640,12 @@ define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext % define <8 x double> @vp_round_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -663,12 +663,12 @@ define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -684,12 +684,12 @@ define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroe define <15 x double> @vp_round_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v15f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v 
v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -707,12 +707,12 @@ define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -728,12 +728,12 @@ define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroe define <16 x double> @vp_round_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -762,8 +762,8 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -778,33 +778,33 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsrmi a1, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index c28d5fb1a8193..addb76b0bea7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.roundeven.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -35,12 +35,12 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -59,12 +59,12 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -77,11 +77,11 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.roundeven.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; 
ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -121,12 +121,12 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -145,12 +145,12 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,11 +163,11 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.roundeven.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -207,12 +207,12 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -231,12 +231,12 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v8f16_unmasked: ; ZVFH: # %bb.0: -; 
ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -249,11 +249,11 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -273,12 +273,12 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) -; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vmv1r.v v0, v10 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -295,12 +295,12 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -319,12 +319,12 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -337,11 +337,11 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -363,9 +363,9 @@ define <2 x float> @vp_roundeven_v2f32(<2 x 
float> %va, <2 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -384,8 +384,8 @@ define <2 x float> @vp_roundeven_v2f32_unmasked(<2 x float> %va, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -405,9 +405,9 @@ define <4 x float> @vp_roundeven_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -426,8 +426,8 @@ define <4 x float> @vp_roundeven_v4f32_unmasked(<4 x float> %va, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -448,9 +448,9 @@ define <8 x float> @vp_roundeven_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -470,8 +470,8 @@ define <8 x float> @vp_roundeven_v8f32_unmasked(<8 x float> %va, i32 zeroext %ev ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -492,9 +492,9 @@ define <16 x float> @vp_roundeven_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -514,8 +514,8 @@ define <16 x float> @vp_roundeven_v16f32_unmasked(<16 x float> %va, i32 zeroext ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.roundeven.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_roundeven_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v2f64: ; 
CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI16_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -552,12 +552,12 @@ define <2 x double> @vp_roundeven_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe define <2 x double> @vp_roundeven_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -575,12 +575,12 @@ define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -596,12 +596,12 @@ define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe define <4 x double> @vp_roundeven_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -619,12 +619,12 @@ define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -640,12 +640,12 @@ define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroe define <8 x double> @vp_roundeven_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; 
CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -663,12 +663,12 @@ define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 z ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -684,12 +684,12 @@ define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 z define <15 x double> @vp_roundeven_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v15f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -707,12 +707,12 @@ define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 z ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -728,12 +728,12 @@ define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 z define <16 x double> @vp_roundeven_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -762,8 +762,8 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -778,33 +778,33 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vsetvli zero, 
zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 64d3664a4c372..bac25bcfec01d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -13,13 +13,13 @@ declare <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -35,12 +35,12 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -59,12 +59,12 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; 
ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -77,11 +77,11 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -99,13 +99,13 @@ declare <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -121,12 +121,12 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v11, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v11, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vmv.v.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -145,12 +145,12 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -163,11 +163,11 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -185,13 +185,13 @@ declare <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half>, <8 x i1>, i32) define <8 x 
half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -207,12 +207,12 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v12, v10, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v12, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -231,12 +231,12 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -249,11 +249,11 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -273,12 +273,12 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vfabs.v v12, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) -; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vmv1r.v v0, v10 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -295,12 +295,12 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v12, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; 
ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -319,12 +319,12 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -337,11 +337,11 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -363,9 +363,9 @@ define <2 x float> @vp_roundtozero_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -384,8 +384,8 @@ define <2 x float> @vp_roundtozero_v2f32_unmasked(<2 x float> %va, i32 zeroext % ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -405,9 +405,9 @@ define <4 x float> @vp_roundtozero_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: vfabs.v v9, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -426,8 +426,8 @@ define <4 x float> @vp_roundtozero_v4f32_unmasked(<4 x float> %va, i32 zeroext % ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -448,9 +448,9 @@ define <8 x float> @vp_roundtozero_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroe ; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -470,8 +470,8 @@ define <8 x float> @vp_roundtozero_v8f32_unmasked(<8 x float> %va, i32 zeroext % ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -492,9 +492,9 @@ define <16 x float> @vp_roundtozero_v16f32(<16 x float> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -514,8 +514,8 @@ define <16 x float> @vp_roundtozero_v16f32_unmasked(<16 x float> %va, i32 zeroex ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -531,13 +531,13 @@ declare <2 x double> @llvm.vp.roundtozero.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_roundtozero_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI16_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a0) +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -552,12 +552,12 @@ define <2 x double> @vp_roundtozero_v2f64(<2 x double> %va, <2 x i1> %m, i32 zer define <2 x double> @vp_roundtozero_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -575,12 +575,12 @@ define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI18_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -596,12 +596,12 @@ define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zer define <4 x double> @vp_roundtozero_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: 
vp_roundtozero_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -619,12 +619,12 @@ define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zer ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI20_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -640,12 +640,12 @@ define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zer define <8 x double> @vp_roundtozero_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -663,12 +663,12 @@ define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI22_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -684,12 +684,12 @@ define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32 define <15 x double> @vp_roundtozero_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v15f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -707,12 +707,12 @@ define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI24_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, 
ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -728,12 +728,12 @@ define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32 define <16 x double> @vp_roundtozero_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -762,8 +762,8 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -778,33 +778,33 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsrmi a1, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index 318f38839851c..034a969fc2847 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -141,36 +141,18 @@ define <32 x i32> @select_addsub_v32i32(<32 x i1> %cc, <32 x i32> %a, <32 x i32> define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> %b) { ; CHECK-LABEL: select_addsub_v64i32: ; CHECK: # 
%bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t ; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vadd.vv v8, v8, v24 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 4 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vadd.vv v16, v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %sub = sub <64 x i32> %a, %b %add = add <64 x i32> %a, %b diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index 03d5762b4903e..13242fc8f0d66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -1073,19 +1073,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFH-NEXT: addi a1, a0, 128 ; ZVFH-NEXT: li a3, 64 +; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVFH-NEXT: vslidedown.vi v24, v0, 8 ; ZVFH-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; ZVFH-NEXT: vle16.v v16, (a1) ; ZVFH-NEXT: addi a1, sp, 16 ; ZVFH-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFH-NEXT: mv a1, a2 ; ZVFH-NEXT: vle16.v v16, (a0) -; ZVFH-NEXT: mv a0, a2 -; ZVFH-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; ZVFH-NEXT: vslidedown.vi v24, v0, 8 ; ZVFH-NEXT: bltu a2, a3, .LBB43_2 ; ZVFH-NEXT: # %bb.1: -; ZVFH-NEXT: li a0, 64 +; ZVFH-NEXT: li a1, 64 ; ZVFH-NEXT: .LBB43_2: -; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; ZVFH-NEXT: vmfeq.vv v7, v8, v16, v0.t ; ZVFH-NEXT: addi a0, a2, -64 ; ZVFH-NEXT: sltu a1, a2, a0 @@ -1114,20 +1114,32 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ; ZVFHMIN32-LABEL: fcmp_oeq_vv_v128f16: ; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -896 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 896 -; ZVFHMIN32-NEXT: sw ra, 892(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s0, 888(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s2, 884(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s3, 880(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s4, 876(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s5, 872(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s6, 868(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s7, 864(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s8, 860(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s9, 856(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s10, 852(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: sw s11, 848(sp) # 
4-byte Folded Spill +; ZVFHMIN32-NEXT: addi sp, sp, -1024 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 1024 +; ZVFHMIN32-NEXT: sw ra, 1020(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s0, 1016(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s2, 1012(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s3, 1008(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s4, 1004(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s5, 1000(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s6, 996(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s7, 992(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s8, 988(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s9, 984(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s10, 980(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: sw s11, 976(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs0, 968(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs1, 960(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs2, 952(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs3, 944(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs4, 936(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs5, 928(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs6, 920(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs7, 912(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs8, 904(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs9, 896(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs10, 888(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: fsd fs11, 880(sp) # 8-byte Folded Spill ; ZVFHMIN32-NEXT: .cfi_offset ra, -4 ; ZVFHMIN32-NEXT: .cfi_offset s0, -8 ; ZVFHMIN32-NEXT: .cfi_offset s2, -12 @@ -1140,1096 +1152,1175 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: .cfi_offset s9, -40 ; ZVFHMIN32-NEXT: .cfi_offset s10, -44 ; ZVFHMIN32-NEXT: .cfi_offset s11, -48 -; ZVFHMIN32-NEXT: addi s0, sp, 896 +; ZVFHMIN32-NEXT: .cfi_offset fs0, -56 +; ZVFHMIN32-NEXT: .cfi_offset fs1, -64 +; ZVFHMIN32-NEXT: .cfi_offset fs2, -72 +; ZVFHMIN32-NEXT: .cfi_offset fs3, -80 +; ZVFHMIN32-NEXT: .cfi_offset fs4, -88 +; ZVFHMIN32-NEXT: .cfi_offset fs5, -96 +; ZVFHMIN32-NEXT: .cfi_offset fs6, -104 +; ZVFHMIN32-NEXT: .cfi_offset fs7, -112 +; ZVFHMIN32-NEXT: .cfi_offset fs8, -120 +; ZVFHMIN32-NEXT: .cfi_offset fs9, -128 +; ZVFHMIN32-NEXT: .cfi_offset fs10, -136 +; ZVFHMIN32-NEXT: .cfi_offset fs11, -144 +; ZVFHMIN32-NEXT: addi s0, sp, 1024 ; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 ; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: li a2, 30 +; ZVFHMIN32-NEXT: li a2, 41 ; ZVFHMIN32-NEXT: mul a1, a1, a2 ; ZVFHMIN32-NEXT: sub sp, sp, a1 ; ZVFHMIN32-NEXT: andi sp, sp, -128 -; ZVFHMIN32-NEXT: addi a1, a0, 128 -; ZVFHMIN32-NEXT: li a2, 64 -; ZVFHMIN32-NEXT: addi a3, sp, 640 -; ZVFHMIN32-NEXT: addi a4, sp, 384 -; ZVFHMIN32-NEXT: addi a5, sp, 512 -; ZVFHMIN32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; ZVFHMIN32-NEXT: addi a3, a0, 128 +; ZVFHMIN32-NEXT: li a1, 64 +; ZVFHMIN32-NEXT: addi a4, sp, 640 +; ZVFHMIN32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: vle16.v v24, (a3) +; ZVFHMIN32-NEXT: csrr a3, vlenb +; ZVFHMIN32-NEXT: slli a5, a3, 5 +; ZVFHMIN32-NEXT: add a3, a5, a3 +; ZVFHMIN32-NEXT: add a3, sp, a3 +; ZVFHMIN32-NEXT: addi a3, a3, 880 +; ZVFHMIN32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN32-NEXT: vle16.v v0, (a0) -; ZVFHMIN32-NEXT: addi a0, sp, 256 -; ZVFHMIN32-NEXT: vle16.v v24, (a1) -; ZVFHMIN32-NEXT: vse16.v v8, (a3) -; ZVFHMIN32-NEXT: vse16.v v0, (a4) -; ZVFHMIN32-NEXT: vse16.v v16, (a5) -; ZVFHMIN32-NEXT: vse16.v v24, (a0) -; 
ZVFHMIN32-NEXT: lh a0, 704(sp) +; ZVFHMIN32-NEXT: vse16.v v8, (a4) +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 5 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 6 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 5 +; ZVFHMIN32-NEXT: sub a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 5 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 30 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 4 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 29 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 3 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 28 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 2 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 27 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 1 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 26 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 15 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 24 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 14 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 22 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 13 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 20 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 18 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 11 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 3 +; ZVFHMIN32-NEXT: add a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 10 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; 
ZVFHMIN32-NEXT: slli a3, a0, 3 +; ZVFHMIN32-NEXT: sub a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 9 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 2 +; ZVFHMIN32-NEXT: add a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 8 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 1 +; ZVFHMIN32-NEXT: add a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: addi a0, sp, 384 +; ZVFHMIN32-NEXT: addi a3, sp, 512 +; ZVFHMIN32-NEXT: vmv.x.s a5, v16 +; ZVFHMIN32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN32-NEXT: vse16.v v0, (a0) +; ZVFHMIN32-NEXT: vse16.v v16, (a3) +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 7 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 11 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 6 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 12 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 5 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 13 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 4 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 14 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 4 +; ZVFHMIN32-NEXT: sub a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 4 +; ZVFHMIN32-NEXT: add a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 15 +; ZVFHMIN32-NEXT: addi a0, sp, 880 +; ZVFHMIN32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vslidedown.vi v4, v16, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v2, v16, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v24, v16, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v22, v16, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v20, v16, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v16, v16, 8 +; ZVFHMIN32-NEXT: vmv.x.s a6, v0 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; 
ZVFHMIN32-NEXT: vslidedown.vi v5, v0, 7 +; ZVFHMIN32-NEXT: vslidedown.vi v17, v0, 6 +; ZVFHMIN32-NEXT: vslidedown.vi v23, v0, 5 +; ZVFHMIN32-NEXT: vslidedown.vi v19, v0, 4 +; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 3 +; ZVFHMIN32-NEXT: vslidedown.vi v3, v0, 2 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v0, 1 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v0, 15 +; ZVFHMIN32-NEXT: vslidedown.vi v10, v0, 14 +; ZVFHMIN32-NEXT: vslidedown.vi v12, v0, 13 +; ZVFHMIN32-NEXT: vslidedown.vi v14, v0, 12 +; ZVFHMIN32-NEXT: vslidedown.vi v26, v0, 11 +; ZVFHMIN32-NEXT: vslidedown.vi v28, v0, 10 +; ZVFHMIN32-NEXT: vslidedown.vi v30, v0, 9 +; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: li a3, 24 +; ZVFHMIN32-NEXT: mul a0, a0, a3 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a0) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a0, v6 +; ZVFHMIN32-NEXT: csrr a3, vlenb +; ZVFHMIN32-NEXT: li a4, 22 +; ZVFHMIN32-NEXT: mul a3, a3, a4 +; ZVFHMIN32-NEXT: add a3, sp, a3 +; ZVFHMIN32-NEXT: addi a3, a3, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a3) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a3, v6 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: li a7, 20 +; ZVFHMIN32-NEXT: mul a4, a4, a7 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s a7, v6 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: li t0, 18 +; ZVFHMIN32-NEXT: mul a4, a4, t0 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s3, v6 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: slli t0, a4, 3 +; ZVFHMIN32-NEXT: add a4, t0, a4 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s10, v6 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: slli t0, a4, 3 +; ZVFHMIN32-NEXT: sub a4, t0, a4 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s11, v6 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: slli t0, a4, 2 +; ZVFHMIN32-NEXT: add a4, t0, a4 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s5, v6 +; ZVFHMIN32-NEXT: csrr a4, vlenb +; ZVFHMIN32-NEXT: slli t0, a4, 1 +; ZVFHMIN32-NEXT: add a4, t0, a4 +; ZVFHMIN32-NEXT: add a4, sp, a4 +; ZVFHMIN32-NEXT: addi a4, a4, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s7, v6 +; ZVFHMIN32-NEXT: addi a4, sp, 880 +; ZVFHMIN32-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vmv.x.s s9, v6 +; ZVFHMIN32-NEXT: vmv.x.s s8, v4 +; ZVFHMIN32-NEXT: vmv.x.s s6, v2 +; ZVFHMIN32-NEXT: vmv.x.s s4, v24 +; ZVFHMIN32-NEXT: vmv.x.s s2, v22 +; ZVFHMIN32-NEXT: vmv.x.s a4, v20 +; ZVFHMIN32-NEXT: vmv.x.s t0, v18 +; ZVFHMIN32-NEXT: sw t0, 120(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s t0, v16 +; ZVFHMIN32-NEXT: sw t0, 124(sp) # 4-byte Folded Spill +; 
ZVFHMIN32-NEXT: vmv.x.s t6, v8 +; ZVFHMIN32-NEXT: vmv.x.s t0, v10 +; ZVFHMIN32-NEXT: vmv.x.s t1, v12 +; ZVFHMIN32-NEXT: vmv.x.s t2, v14 +; ZVFHMIN32-NEXT: vmv.x.s t3, v26 +; ZVFHMIN32-NEXT: vmv.x.s t4, v28 +; ZVFHMIN32-NEXT: vmv.x.s t5, v30 +; ZVFHMIN32-NEXT: fmv.h.x fs8, a2 +; ZVFHMIN32-NEXT: fmv.h.x fs7, a5 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fs6, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a5, a2, 5 +; ZVFHMIN32-NEXT: sub a2, a5, a2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fs5, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 30 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft10, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 29 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft8, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 28 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft2, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 27 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft3, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 26 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft4, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 11 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft5, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 12 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x ft6, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 13 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa6, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: li a5, 14 +; ZVFHMIN32-NEXT: mul a2, a2, a5 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fs0, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a5, a2, 4 +; ZVFHMIN32-NEXT: sub a2, a5, a2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fs1, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a2, a2, 4 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fs2, a2 +; ZVFHMIN32-NEXT: csrr a2, vlenb +; ZVFHMIN32-NEXT: slli a5, a2, 4 +; ZVFHMIN32-NEXT: add a2, a5, a2 +; ZVFHMIN32-NEXT: add a2, sp, a2 +; ZVFHMIN32-NEXT: lh a2, 880(a2) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fs3, a2 +; ZVFHMIN32-NEXT: addi a2, sp, 256 +; ZVFHMIN32-NEXT: fmv.h.x fs4, a0 +; ZVFHMIN32-NEXT: fmv.h.x ft7, a3 +; ZVFHMIN32-NEXT: fmv.h.x ft11, a7 +; ZVFHMIN32-NEXT: 
fmv.h.x ft9, s3 +; ZVFHMIN32-NEXT: fmv.h.x fa7, s10 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s11 +; ZVFHMIN32-NEXT: fsh fa5, 114(sp) # 2-byte Folded Spill +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a3, a0, 5 +; ZVFHMIN32-NEXT: add a0, a3, a0 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: addi a0, a0, 880 +; ZVFHMIN32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN32-NEXT: vse16.v v24, (a2) +; ZVFHMIN32-NEXT: vmv.x.s a3, v0 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 +; ZVFHMIN32-NEXT: vmv.x.s a5, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14 +; ZVFHMIN32-NEXT: vmv.x.s ra, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 +; ZVFHMIN32-NEXT: vmv.x.s a1, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 +; ZVFHMIN32-NEXT: vmv.x.s s3, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 +; ZVFHMIN32-NEXT: vmv.x.s a7, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: sw a0, 116(sp) # 4-byte Folded Spill +; ZVFHMIN32-NEXT: fmv.h.x fa3, s5 +; ZVFHMIN32-NEXT: vmv.x.s s5, v5 +; ZVFHMIN32-NEXT: fmv.h.x fa2, s7 +; ZVFHMIN32-NEXT: vmv.x.s s7, v17 +; ZVFHMIN32-NEXT: fmv.h.x fa1, s9 +; ZVFHMIN32-NEXT: vmv.x.s s9, v23 +; ZVFHMIN32-NEXT: fmv.h.x fa0, s8 +; ZVFHMIN32-NEXT: vmv.x.s s8, v19 +; ZVFHMIN32-NEXT: fmv.h.x ft0, s6 +; ZVFHMIN32-NEXT: vmv.x.s s6, v21 +; ZVFHMIN32-NEXT: fmv.h.x ft1, s4 +; ZVFHMIN32-NEXT: vmv.x.s s10, v3 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN32-NEXT: fsh fa5, 112(sp) # 2-byte Folded Spill +; ZVFHMIN32-NEXT: vmv.x.s s2, v24 +; ZVFHMIN32-NEXT: fmv.h.x fs9, a6 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: add a0, sp, a0 +; ZVFHMIN32-NEXT: lh a6, 880(a0) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN32-NEXT: fmv.h.x fs10, s2 +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN32-NEXT: fmv.h.x fs11, s5 +; ZVFHMIN32-NEXT: feq.h s2, fs8, fs9 +; ZVFHMIN32-NEXT: fmv.h.x fs8, s7 +; ZVFHMIN32-NEXT: vmv.x.s s7, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN32-NEXT: fmv.h.x fs9, s9 +; ZVFHMIN32-NEXT: feq.h s11, fs7, fs10 +; ZVFHMIN32-NEXT: fmv.h.x fs7, s8 +; ZVFHMIN32-NEXT: vmv.x.s s8, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN32-NEXT: fmv.h.x fs10, s6 +; ZVFHMIN32-NEXT: feq.h s4, fs6, fs11 +; ZVFHMIN32-NEXT: fmv.h.x fs6, s10 +; ZVFHMIN32-NEXT: vmv.x.s s9, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN32-NEXT: fmv.h.x fs11, a6 +; ZVFHMIN32-NEXT: feq.h s5, fs5, fs8 +; ZVFHMIN32-NEXT: fmv.h.x fs5, a0 +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN32-NEXT: fmv.h.x fs8, s7 +; ZVFHMIN32-NEXT: feq.h s6, ft10, fs9 +; ZVFHMIN32-NEXT: fmv.h.x fs9, s8 +; ZVFHMIN32-NEXT: vmv.x.s a6, v8 +; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN32-NEXT: feq.h s7, ft8, fs7 +; ZVFHMIN32-NEXT: fmv.h.x fs7, a0 +; ZVFHMIN32-NEXT: vmv.x.s a0, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 +; ZVFHMIN32-NEXT: feq.h s8, ft2, fs10 +; ZVFHMIN32-NEXT: fmv.h.x fs10, a0 +; ZVFHMIN32-NEXT: feq.h s9, ft3, fs6 +; ZVFHMIN32-NEXT: fmv.h.x fs6, t6 +; ZVFHMIN32-NEXT: feq.h s10, ft4, fs11 +; ZVFHMIN32-NEXT: fmv.h.x fs11, t0 +; ZVFHMIN32-NEXT: feq.h t0, ft5, fs5 +; ZVFHMIN32-NEXT: fmv.h.x fs5, t1 +; ZVFHMIN32-NEXT: 
feq.h t1, ft6, fs8 +; ZVFHMIN32-NEXT: fmv.h.x ft10, t2 +; ZVFHMIN32-NEXT: feq.h t2, fa6, fs9 +; ZVFHMIN32-NEXT: fmv.h.x ft8, t3 +; ZVFHMIN32-NEXT: feq.h t3, fs0, fa5 +; ZVFHMIN32-NEXT: fmv.h.x ft2, t4 +; ZVFHMIN32-NEXT: feq.h t4, fs1, fs7 +; ZVFHMIN32-NEXT: fmv.h.x ft3, t5 +; ZVFHMIN32-NEXT: feq.h t5, fs2, fa4 +; ZVFHMIN32-NEXT: fmv.h.x ft4, a3 +; ZVFHMIN32-NEXT: feq.h t6, fs3, fs10 +; ZVFHMIN32-NEXT: fmv.h.x ft5, a5 +; ZVFHMIN32-NEXT: feq.h a0, fs4, fs6 +; ZVFHMIN32-NEXT: fmv.h.x ft6, ra +; ZVFHMIN32-NEXT: feq.h a5, ft7, fs11 +; ZVFHMIN32-NEXT: fmv.h.x ft7, a2 +; ZVFHMIN32-NEXT: lh a2, 704(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa6, a1 +; ZVFHMIN32-NEXT: feq.h a6, ft11, fs5 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN32-NEXT: lh a1, 448(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 160(sp) -; ZVFHMIN32-NEXT: lh a0, 702(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 160(sp) +; ZVFHMIN32-NEXT: lh a1, 702(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 446(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 159(sp) -; ZVFHMIN32-NEXT: lh a0, 700(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 159(sp) +; ZVFHMIN32-NEXT: lh a1, 700(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 444(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 158(sp) -; ZVFHMIN32-NEXT: lh a0, 698(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 158(sp) +; ZVFHMIN32-NEXT: lh a1, 698(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 442(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 157(sp) -; ZVFHMIN32-NEXT: lh a0, 696(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 157(sp) +; ZVFHMIN32-NEXT: lh a1, 696(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 440(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 156(sp) -; ZVFHMIN32-NEXT: lh a0, 694(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 156(sp) +; ZVFHMIN32-NEXT: lh a1, 694(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 438(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 155(sp) -; ZVFHMIN32-NEXT: lh a0, 692(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 155(sp) +; ZVFHMIN32-NEXT: lh a1, 692(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 436(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 154(sp) -; ZVFHMIN32-NEXT: lh a0, 690(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 154(sp) +; ZVFHMIN32-NEXT: lh a1, 690(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 434(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 153(sp) -; ZVFHMIN32-NEXT: lh a0, 688(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 153(sp) +; ZVFHMIN32-NEXT: lh a1, 688(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 432(sp) -; ZVFHMIN32-NEXT: 
fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 152(sp) -; ZVFHMIN32-NEXT: lh a0, 686(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 152(sp) +; ZVFHMIN32-NEXT: lh a1, 686(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 430(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 151(sp) -; ZVFHMIN32-NEXT: lh a0, 684(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 151(sp) +; ZVFHMIN32-NEXT: lh a1, 684(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 428(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 150(sp) -; ZVFHMIN32-NEXT: lh a0, 682(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 150(sp) +; ZVFHMIN32-NEXT: lh a1, 682(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 426(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 149(sp) -; ZVFHMIN32-NEXT: lh a0, 680(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 149(sp) +; ZVFHMIN32-NEXT: lh a1, 680(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 424(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 148(sp) -; ZVFHMIN32-NEXT: lh a0, 678(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 148(sp) +; ZVFHMIN32-NEXT: lh a1, 678(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 422(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 147(sp) -; ZVFHMIN32-NEXT: lh a0, 676(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 147(sp) +; ZVFHMIN32-NEXT: lh a1, 676(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 420(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 146(sp) -; ZVFHMIN32-NEXT: lh a0, 674(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 146(sp) +; ZVFHMIN32-NEXT: lh a1, 674(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 418(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN32-NEXT: vmv.x.s a2, v0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN32-NEXT: sb a0, 145(sp) -; ZVFHMIN32-NEXT: lh a0, 672(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 145(sp) +; ZVFHMIN32-NEXT: lh a1, 672(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 416(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 128(sp) -; ZVFHMIN32-NEXT: sb a0, 144(sp) -; ZVFHMIN32-NEXT: lh a0, 576(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb s2, 128(sp) +; ZVFHMIN32-NEXT: feq.h s2, ft9, ft10 +; ZVFHMIN32-NEXT: sb a1, 144(sp) +; ZVFHMIN32-NEXT: lh a1, 576(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 320(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; 
ZVFHMIN32-NEXT: sb a0, 224(sp) -; ZVFHMIN32-NEXT: lh a0, 574(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 224(sp) +; ZVFHMIN32-NEXT: lh a1, 574(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 318(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 223(sp) -; ZVFHMIN32-NEXT: lh a0, 572(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 223(sp) +; ZVFHMIN32-NEXT: lh a1, 572(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 316(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 222(sp) -; ZVFHMIN32-NEXT: lh a0, 570(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 222(sp) +; ZVFHMIN32-NEXT: lh a1, 570(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 314(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 221(sp) -; ZVFHMIN32-NEXT: lh a0, 568(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 221(sp) +; ZVFHMIN32-NEXT: lh a1, 568(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 312(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 220(sp) -; ZVFHMIN32-NEXT: lh a0, 566(sp) -; ZVFHMIN32-NEXT: lh a1, 310(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 220(sp) +; ZVFHMIN32-NEXT: lh a1, 566(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 310(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 219(sp) -; ZVFHMIN32-NEXT: lh a0, 564(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 219(sp) +; ZVFHMIN32-NEXT: lh a1, 564(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 308(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 218(sp) -; ZVFHMIN32-NEXT: lh a0, 562(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 218(sp) +; ZVFHMIN32-NEXT: lh a1, 562(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 306(sp) -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 7 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 29 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 6 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 28 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 5 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 27 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 4 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 26 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi 
v10, v8, 3 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 25 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 2 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 24 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 1 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 23 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v28, v8, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v8, 12 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 1 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v4, v8, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v2, v8, 10 -; ZVFHMIN32-NEXT: vslidedown.vi v30, v8, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v22, v8, 8 -; ZVFHMIN32-NEXT: vmv.x.s a4, v16 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 217(sp) -; ZVFHMIN32-NEXT: lh a0, 560(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 217(sp) +; ZVFHMIN32-NEXT: lh a1, 560(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 304(sp) -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v3, v16, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v31, v16, 6 -; ZVFHMIN32-NEXT: vslidedown.vi v5, v16, 5 -; ZVFHMIN32-NEXT: vslidedown.vi v23, v16, 4 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 3 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 2 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 20 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 1 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 22 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v18, v16, 15 -; ZVFHMIN32-NEXT: vslidedown.vi v14, v16, 14 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v16, 13 -; ZVFHMIN32-NEXT: vslidedown.vi v12, v16, 12 -; ZVFHMIN32-NEXT: vslidedown.vi v10, v16, 11 -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 10 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 18 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 9 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 14 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; 
ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v6, v16, 8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 216(sp) -; ZVFHMIN32-NEXT: lh a0, 558(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 216(sp) +; ZVFHMIN32-NEXT: lh a1, 558(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 302(sp) -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v13, v0, 7 -; ZVFHMIN32-NEXT: vslidedown.vi v29, v0, 6 -; ZVFHMIN32-NEXT: vslidedown.vi v11, v0, 5 -; ZVFHMIN32-NEXT: vslidedown.vi v7, v0, 4 -; ZVFHMIN32-NEXT: vslidedown.vi v9, v0, 3 -; ZVFHMIN32-NEXT: vslidedown.vi v21, v0, 2 -; ZVFHMIN32-NEXT: vslidedown.vi v27, v0, 1 -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 15 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 2 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 14 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 13 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 6 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 12 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 12 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 11 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 10 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 10 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 4 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vslidedown.vi v16, v0, 9 -; ZVFHMIN32-NEXT: vslidedown.vi v0, v0, 8 -; ZVFHMIN32-NEXT: addi a2, sp, 848 -; ZVFHMIN32-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s t4, v26 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 215(sp) -; ZVFHMIN32-NEXT: lh a0, 556(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 215(sp) +; ZVFHMIN32-NEXT: lh a1, 556(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 300(sp) -; ZVFHMIN32-NEXT: vmv.x.s t3, v20 -; ZVFHMIN32-NEXT: vmv.x.s t1, v28 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 214(sp) -; ZVFHMIN32-NEXT: lh a0, 554(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 214(sp) +; ZVFHMIN32-NEXT: lh a1, 554(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 298(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, 
a2, 1 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t2, v0 -; ZVFHMIN32-NEXT: vmv.x.s t0, v4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 213(sp) -; ZVFHMIN32-NEXT: lh a0, 552(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 213(sp) +; ZVFHMIN32-NEXT: lh a1, 552(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 296(sp) -; ZVFHMIN32-NEXT: vmv.x.s a7, v2 -; ZVFHMIN32-NEXT: vmv.x.s a6, v30 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 212(sp) -; ZVFHMIN32-NEXT: lh a0, 550(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 212(sp) +; ZVFHMIN32-NEXT: lh a1, 550(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 294(sp) -; ZVFHMIN32-NEXT: vmv.x.s a5, v22 -; ZVFHMIN32-NEXT: vmv.x.s a2, v18 -; ZVFHMIN32-NEXT: sw a2, 112(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 211(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 211(sp) ; ZVFHMIN32-NEXT: lh a1, 548(sp) -; ZVFHMIN32-NEXT: lh t5, 292(sp) -; ZVFHMIN32-NEXT: vmv.x.s a0, v14 -; ZVFHMIN32-NEXT: sw a0, 116(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a0, v8 -; ZVFHMIN32-NEXT: sw a0, 124(sp) # 4-byte Folded Spill ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN32-NEXT: lh a1, 292(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: sb a1, 210(sp) ; ZVFHMIN32-NEXT: lh a1, 546(sp) -; ZVFHMIN32-NEXT: lh t5, 290(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN32-NEXT: vmv.x.s a4, v24 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 290(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: sb a1, 209(sp) ; ZVFHMIN32-NEXT: lh a1, 544(sp) -; ZVFHMIN32-NEXT: lh t5, 288(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN32-NEXT: lh a1, 288(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a4, 192(sp) +; ZVFHMIN32-NEXT: sb s11, 192(sp) +; ZVFHMIN32-NEXT: feq.h s11, fa7, ft8 ; ZVFHMIN32-NEXT: sb a1, 208(sp) -; ZVFHMIN32-NEXT: lh t5, 738(sp) -; ZVFHMIN32-NEXT: lh t6, 482(sp) -; ZVFHMIN32-NEXT: vmv.x.s a0, v12 -; ZVFHMIN32-NEXT: sw a0, 108(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: vmv.x.s a0, v10 -; ZVFHMIN32-NEXT: sw a0, 120(sp) # 4-byte Folded Spill -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 177(sp) -; ZVFHMIN32-NEXT: lh t5, 736(sp) -; ZVFHMIN32-NEXT: lh t6, 480(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 29 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s5, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 28 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s6, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 
-; ZVFHMIN32-NEXT: sb t5, 176(sp) -; ZVFHMIN32-NEXT: lh t5, 734(sp) -; ZVFHMIN32-NEXT: lh t6, 478(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 27 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s7, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 26 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s8, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 175(sp) -; ZVFHMIN32-NEXT: lh t5, 732(sp) -; ZVFHMIN32-NEXT: lh t6, 476(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 25 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s4, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 24 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s3, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN32-NEXT: sb t5, 174(sp) -; ZVFHMIN32-NEXT: lh t6, 730(sp) -; ZVFHMIN32-NEXT: lh s9, 474(sp) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: li a1, 23 -; ZVFHMIN32-NEXT: mul a0, a0, a1 -; ZVFHMIN32-NEXT: add a0, sp, a0 -; ZVFHMIN32-NEXT: lh s2, 848(a0) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t5, v3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN32-NEXT: feq.h t6, fa5, fa4 -; ZVFHMIN32-NEXT: sb t6, 173(sp) -; ZVFHMIN32-NEXT: lh s9, 728(sp) -; ZVFHMIN32-NEXT: lh s10, 472(sp) -; ZVFHMIN32-NEXT: vmv.x.s t6, v31 -; ZVFHMIN32-NEXT: vmv.x.s ra, v13 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN32-NEXT: sb s9, 172(sp) -; ZVFHMIN32-NEXT: lh s9, 726(sp) -; ZVFHMIN32-NEXT: lh s10, 470(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v29 -; ZVFHMIN32-NEXT: vmv.x.s a3, v11 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN32-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN32-NEXT: sb s9, 171(sp) -; ZVFHMIN32-NEXT: lh s10, 724(sp) -; ZVFHMIN32-NEXT: lh s11, 468(sp) -; ZVFHMIN32-NEXT: vmv.x.s a4, v7 -; ZVFHMIN32-NEXT: vmv.x.s s9, v9 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s11 -; ZVFHMIN32-NEXT: feq.h s10, fa5, fa4 -; ZVFHMIN32-NEXT: sb s10, 170(sp) -; ZVFHMIN32-NEXT: lh a0, 722(sp) +; ZVFHMIN32-NEXT: lh a1, 738(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 482(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 177(sp) +; ZVFHMIN32-NEXT: lh a1, 736(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 480(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 176(sp) +; ZVFHMIN32-NEXT: lh a1, 734(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 478(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 175(sp) +; ZVFHMIN32-NEXT: lh a1, 732(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 476(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 174(sp) +; ZVFHMIN32-NEXT: lh a1, 730(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 474(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: 
sb a1, 173(sp) +; ZVFHMIN32-NEXT: lh a1, 728(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 472(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 172(sp) +; ZVFHMIN32-NEXT: lh a1, 726(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 470(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 171(sp) +; ZVFHMIN32-NEXT: lh a1, 724(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 468(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 170(sp) +; ZVFHMIN32-NEXT: lh a1, 722(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 466(sp) -; ZVFHMIN32-NEXT: vmv.x.s s10, v21 -; ZVFHMIN32-NEXT: vmv.x.s s11, v27 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 169(sp) -; ZVFHMIN32-NEXT: lh a0, 720(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 169(sp) +; ZVFHMIN32-NEXT: lh a1, 720(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 464(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s6 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 168(sp) -; ZVFHMIN32-NEXT: lh a0, 718(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 168(sp) +; ZVFHMIN32-NEXT: lh a1, 718(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 462(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, s7 -; ZVFHMIN32-NEXT: fmv.h.x fa2, s8 -; ZVFHMIN32-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa1, fa0 -; ZVFHMIN32-NEXT: fmv.h.x fa1, ra -; ZVFHMIN32-NEXT: sb a0, 167(sp) -; ZVFHMIN32-NEXT: lh a0, 716(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 167(sp) +; ZVFHMIN32-NEXT: lh a1, 716(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 460(sp) -; ZVFHMIN32-NEXT: feq.h s5, fa5, fa1 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 ; ZVFHMIN32-NEXT: sb a1, 166(sp) ; ZVFHMIN32-NEXT: lh a1, 714(sp) -; ZVFHMIN32-NEXT: lh a2, 458(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h a3, fa3, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 458(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa4, s3 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: sb a1, 165(sp) ; ZVFHMIN32-NEXT: lh a1, 712(sp) -; ZVFHMIN32-NEXT: lh a2, 456(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN32-NEXT: feq.h a4, fa2, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa3, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, s2 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 456(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: sb a1, 164(sp) ; ZVFHMIN32-NEXT: lh a1, 710(sp) -; ZVFHMIN32-NEXT: lh a2, 454(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, s9 -; ZVFHMIN32-NEXT: feq.h s2, fa5, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa2 -; ZVFHMIN32-NEXT: fmv.h.x 
fa5, s10 -; ZVFHMIN32-NEXT: fmv.h.x fa2, s11 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 454(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN32-NEXT: sb a1, 163(sp) ; ZVFHMIN32-NEXT: lh a1, 708(sp) -; ZVFHMIN32-NEXT: lh a2, 452(sp) -; ZVFHMIN32-NEXT: feq.h s3, fa4, fa5 -; ZVFHMIN32-NEXT: feq.h s4, fa3, fa2 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: lh a1, 452(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 ; ZVFHMIN32-NEXT: sb a1, 162(sp) ; ZVFHMIN32-NEXT: lh a1, 706(sp) ; ZVFHMIN32-NEXT: lh a2, 450(sp) -; ZVFHMIN32-NEXT: sb s4, 129(sp) -; ZVFHMIN32-NEXT: sb s3, 130(sp) -; ZVFHMIN32-NEXT: sb s2, 131(sp) -; ZVFHMIN32-NEXT: sb a4, 132(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 133(sp) -; ZVFHMIN32-NEXT: sb a0, 134(sp) -; ZVFHMIN32-NEXT: sb s5, 135(sp) +; ZVFHMIN32-NEXT: sb s10, 129(sp) +; ZVFHMIN32-NEXT: flh fa4, 114(sp) # 2-byte Folded Reload +; ZVFHMIN32-NEXT: feq.h s10, fa4, ft2 +; ZVFHMIN32-NEXT: sb s9, 130(sp) +; ZVFHMIN32-NEXT: feq.h s9, fa3, ft3 +; ZVFHMIN32-NEXT: sb s8, 131(sp) +; ZVFHMIN32-NEXT: feq.h ra, fa2, ft4 +; ZVFHMIN32-NEXT: sb s7, 132(sp) +; ZVFHMIN32-NEXT: feq.h s3, fa1, ft5 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h s7, fa0, ft6 +; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 +; ZVFHMIN32-NEXT: feq.h s8, ft0, ft7 +; ZVFHMIN32-NEXT: sb s6, 133(sp) +; ZVFHMIN32-NEXT: feq.h s6, ft1, fa6 +; ZVFHMIN32-NEXT: sb s5, 134(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN32-NEXT: sb s4, 135(sp) +; ZVFHMIN32-NEXT: flh fa4, 112(sp) # 2-byte Folded Reload +; ZVFHMIN32-NEXT: feq.h s4, fa4, fa5 ; ZVFHMIN32-NEXT: sb a1, 161(sp) -; ZVFHMIN32-NEXT: lh a0, 610(sp) +; ZVFHMIN32-NEXT: lh a1, 610(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 354(sp) -; ZVFHMIN32-NEXT: vmv.x.s s6, v5 -; ZVFHMIN32-NEXT: vmv.x.s s5, v23 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 241(sp) -; ZVFHMIN32-NEXT: lh a0, 608(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 241(sp) +; ZVFHMIN32-NEXT: lh a1, 608(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 352(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 21 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s4, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 20 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s3, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 240(sp) -; ZVFHMIN32-NEXT: lh a0, 606(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 240(sp) +; ZVFHMIN32-NEXT: lh a1, 606(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 350(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 22 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: lh s2, 848(a2) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN32-NEXT: sb a0, 239(sp) -; ZVFHMIN32-NEXT: lh a0, 604(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: 
feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 239(sp) +; ZVFHMIN32-NEXT: lh a1, 604(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 348(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 238(sp) -; ZVFHMIN32-NEXT: lh a0, 602(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 238(sp) +; ZVFHMIN32-NEXT: lh a1, 602(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 346(sp) -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 237(sp) -; ZVFHMIN32-NEXT: lh a0, 600(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 237(sp) +; ZVFHMIN32-NEXT: lh a1, 600(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 344(sp) -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 236(sp) -; ZVFHMIN32-NEXT: lh a0, 598(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 236(sp) +; ZVFHMIN32-NEXT: lh a1, 598(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 342(sp) -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 235(sp) -; ZVFHMIN32-NEXT: lh a0, 596(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 235(sp) +; ZVFHMIN32-NEXT: lh a1, 596(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 340(sp) -; ZVFHMIN32-NEXT: vmv.x.s s8, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 234(sp) -; ZVFHMIN32-NEXT: lh a0, 594(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 234(sp) +; ZVFHMIN32-NEXT: lh a1, 594(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 338(sp) -; ZVFHMIN32-NEXT: vmv.x.s s9, v8 -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: sb a0, 233(sp) -; ZVFHMIN32-NEXT: lh a0, 592(sp) -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: lh t5, 336(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 1 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN32-NEXT: vmv.x.s s7, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa2, t5 -; ZVFHMIN32-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa3, a2 -; ZVFHMIN32-NEXT: sb a0, 232(sp) -; ZVFHMIN32-NEXT: lh a0, 590(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa2, a3 -; ZVFHMIN32-NEXT: lh a2, 334(sp) -; ZVFHMIN32-NEXT: feq.h t5, fa5, fa3 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: feq.h t6, fa4, fa2 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s6 -; ZVFHMIN32-NEXT: sb a0, 231(sp) -; ZVFHMIN32-NEXT: lh a0, 588(sp) -; ZVFHMIN32-NEXT: lh a2, 332(sp) -; ZVFHMIN32-NEXT: fmv.h.x 
fa4, a4 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s5 -; ZVFHMIN32-NEXT: sb a0, 230(sp) -; ZVFHMIN32-NEXT: lh a0, 586(sp) -; ZVFHMIN32-NEXT: lh a2, 330(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s8 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN32-NEXT: sb a0, 229(sp) -; ZVFHMIN32-NEXT: lh a0, 584(sp) -; ZVFHMIN32-NEXT: lh a2, 328(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN32-NEXT: feq.h s4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN32-NEXT: sb a0, 228(sp) -; ZVFHMIN32-NEXT: lh a0, 582(sp) -; ZVFHMIN32-NEXT: lh a2, 326(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN32-NEXT: sb a0, 227(sp) -; ZVFHMIN32-NEXT: lh a0, 580(sp) -; ZVFHMIN32-NEXT: lh a2, 324(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s7 -; ZVFHMIN32-NEXT: feq.h s2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 226(sp) -; ZVFHMIN32-NEXT: lh a0, 578(sp) +; ZVFHMIN32-NEXT: sb a1, 233(sp) +; ZVFHMIN32-NEXT: lh a1, 592(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 336(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 232(sp) +; ZVFHMIN32-NEXT: lh a1, 590(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 334(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 231(sp) +; ZVFHMIN32-NEXT: lh a1, 588(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 332(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 230(sp) +; ZVFHMIN32-NEXT: lh a1, 586(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 330(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 229(sp) +; ZVFHMIN32-NEXT: lh a1, 584(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 328(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 228(sp) +; ZVFHMIN32-NEXT: lh a1, 582(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 326(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 227(sp) +; ZVFHMIN32-NEXT: lh a1, 580(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 324(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: sb a1, 226(sp) +; ZVFHMIN32-NEXT: lh a1, 578(sp) ; ZVFHMIN32-NEXT: lh a2, 322(sp) -; ZVFHMIN32-NEXT: sb s2, 193(sp) -; ZVFHMIN32-NEXT: sb a1, 194(sp) -; ZVFHMIN32-NEXT: sb s4, 195(sp) -; ZVFHMIN32-NEXT: sb a4, 196(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN32-NEXT: sb t6, 193(sp) +; ZVFHMIN32-NEXT: sb t5, 194(sp) +; ZVFHMIN32-NEXT: sb t4, 195(sp) +; ZVFHMIN32-NEXT: sb t3, 196(sp) +; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; 
ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a3, 197(sp) -; ZVFHMIN32-NEXT: sb t6, 198(sp) -; ZVFHMIN32-NEXT: sb t5, 199(sp) -; ZVFHMIN32-NEXT: sb a0, 225(sp) -; ZVFHMIN32-NEXT: lh a0, 766(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb t2, 197(sp) +; ZVFHMIN32-NEXT: sb t1, 198(sp) +; ZVFHMIN32-NEXT: sb t0, 199(sp) +; ZVFHMIN32-NEXT: sb a1, 225(sp) +; ZVFHMIN32-NEXT: lh a1, 766(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 510(sp) -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 18 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s2, v8 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: li a3, 14 -; ZVFHMIN32-NEXT: mul a2, a2, a3 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s t6, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 191(sp) -; ZVFHMIN32-NEXT: lh a0, 764(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 191(sp) +; ZVFHMIN32-NEXT: lh a1, 764(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 508(sp) -; ZVFHMIN32-NEXT: vmv.x.s t5, v6 -; ZVFHMIN32-NEXT: csrr a2, vlenb -; ZVFHMIN32-NEXT: slli a2, a2, 2 -; ZVFHMIN32-NEXT: add a2, sp, a2 -; ZVFHMIN32-NEXT: addi a2, a2, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a2, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 190(sp) -; ZVFHMIN32-NEXT: lh a0, 762(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 190(sp) +; ZVFHMIN32-NEXT: lh a1, 762(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 506(sp) -; ZVFHMIN32-NEXT: csrr a3, vlenb -; ZVFHMIN32-NEXT: slli a3, a3, 3 -; ZVFHMIN32-NEXT: add a3, sp, a3 -; ZVFHMIN32-NEXT: addi a3, a3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a3, v8 -; ZVFHMIN32-NEXT: csrr a4, vlenb -; ZVFHMIN32-NEXT: li s3, 6 -; ZVFHMIN32-NEXT: mul a4, a4, s3 -; ZVFHMIN32-NEXT: add a4, sp, a4 -; ZVFHMIN32-NEXT: addi a4, a4, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a4, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 189(sp) -; ZVFHMIN32-NEXT: lh a0, 760(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 189(sp) +; ZVFHMIN32-NEXT: lh a1, 760(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 504(sp) -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: li s4, 12 -; ZVFHMIN32-NEXT: mul s3, s3, s4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s6, v8 -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: li s4, 10 -; ZVFHMIN32-NEXT: mul s3, s3, s4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s4, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 188(sp) -; ZVFHMIN32-NEXT: lh a0, 758(sp) +; ZVFHMIN32-NEXT: 
feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 188(sp) +; ZVFHMIN32-NEXT: lh a1, 758(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 502(sp) -; ZVFHMIN32-NEXT: csrr s3, vlenb -; ZVFHMIN32-NEXT: slli s3, s3, 4 -; ZVFHMIN32-NEXT: add s3, sp, s3 -; ZVFHMIN32-NEXT: addi s3, s3, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s s5, v8 -; ZVFHMIN32-NEXT: vmv.x.s s3, v16 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN32-NEXT: sb a0, 187(sp) -; ZVFHMIN32-NEXT: lh a0, 756(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 187(sp) +; ZVFHMIN32-NEXT: lh a1, 756(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 500(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN32-NEXT: feq.h t4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t3 -; ZVFHMIN32-NEXT: sb a0, 186(sp) -; ZVFHMIN32-NEXT: lh a0, 754(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 186(sp) +; ZVFHMIN32-NEXT: lh a1, 754(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 498(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN32-NEXT: feq.h t3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN32-NEXT: sb a0, 185(sp) -; ZVFHMIN32-NEXT: lh a0, 752(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 185(sp) +; ZVFHMIN32-NEXT: lh a1, 752(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 496(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN32-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN32-NEXT: sb a0, 184(sp) -; ZVFHMIN32-NEXT: lh a0, 750(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 184(sp) +; ZVFHMIN32-NEXT: lh a1, 750(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 494(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s6 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN32-NEXT: sb a0, 183(sp) -; ZVFHMIN32-NEXT: lh a0, 748(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 183(sp) +; ZVFHMIN32-NEXT: lh a1, 748(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 492(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN32-NEXT: sb a0, 182(sp) -; ZVFHMIN32-NEXT: lh a0, 746(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 182(sp) +; ZVFHMIN32-NEXT: lh a1, 746(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 490(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, s5 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN32-NEXT: sb a0, 181(sp) -; ZVFHMIN32-NEXT: lh a0, 744(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 181(sp) +; ZVFHMIN32-NEXT: lh a1, 744(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: lh a1, 488(sp) -; 
ZVFHMIN32-NEXT: fmv.h.x fa4, s3 -; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN32-NEXT: addi a1, sp, 848 -; ZVFHMIN32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN32-NEXT: vmv.x.s a5, v8 -; ZVFHMIN32-NEXT: sb a0, 180(sp) -; ZVFHMIN32-NEXT: lh a0, 742(sp) -; ZVFHMIN32-NEXT: lh a7, 486(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: sb a1, 180(sp) +; ZVFHMIN32-NEXT: lh a1, 742(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN32-NEXT: lh a1, 486(sp) ; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: sb a0, 179(sp) -; ZVFHMIN32-NEXT: lh a0, 740(sp) -; ZVFHMIN32-NEXT: lh a7, 484(sp) -; ZVFHMIN32-NEXT: sb a2, 140(sp) -; ZVFHMIN32-NEXT: sb t1, 141(sp) -; ZVFHMIN32-NEXT: sb t3, 142(sp) -; ZVFHMIN32-NEXT: sb t4, 143(sp) -; ZVFHMIN32-NEXT: sb a1, 136(sp) -; ZVFHMIN32-NEXT: sb a6, 137(sp) -; ZVFHMIN32-NEXT: sb a4, 138(sp) -; ZVFHMIN32-NEXT: sb a3, 139(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN32-NEXT: lw a2, 120(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: lw a2, 116(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN32-NEXT: sb a1, 179(sp) +; ZVFHMIN32-NEXT: lh a2, 740(sp) +; ZVFHMIN32-NEXT: lh a3, 484(sp) +; ZVFHMIN32-NEXT: sb s2, 140(sp) +; ZVFHMIN32-NEXT: sb a6, 141(sp) +; ZVFHMIN32-NEXT: sb a5, 142(sp) +; ZVFHMIN32-NEXT: sb a0, 143(sp) +; ZVFHMIN32-NEXT: sb ra, 136(sp) +; ZVFHMIN32-NEXT: sb s9, 137(sp) +; ZVFHMIN32-NEXT: sb s10, 138(sp) +; ZVFHMIN32-NEXT: sb s11, 139(sp) +; ZVFHMIN32-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a3 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 178(sp) ; ZVFHMIN32-NEXT: lh a0, 638(sp) -; ZVFHMIN32-NEXT: lh a1, 382(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN32-NEXT: vmv.x.s t3, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 382(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 255(sp) ; ZVFHMIN32-NEXT: lh a0, 636(sp) -; ZVFHMIN32-NEXT: lh a1, 380(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 13 -; ZVFHMIN32-NEXT: vmv.x.s t2, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 380(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 254(sp) ; ZVFHMIN32-NEXT: lh a0, 634(sp) -; ZVFHMIN32-NEXT: lh a1, 378(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 12 -; ZVFHMIN32-NEXT: vmv.x.s t1, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 378(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 253(sp) ; ZVFHMIN32-NEXT: lh a0, 632(sp) -; ZVFHMIN32-NEXT: lh a1, 376(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 11 -; ZVFHMIN32-NEXT: vmv.x.s t0, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 376(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 252(sp) ; ZVFHMIN32-NEXT: lh 
a0, 630(sp) -; ZVFHMIN32-NEXT: lh a1, 374(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 10 -; ZVFHMIN32-NEXT: vmv.x.s a7, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 374(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 251(sp) ; ZVFHMIN32-NEXT: lh a0, 628(sp) -; ZVFHMIN32-NEXT: lh a1, 372(sp) -; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 9 -; ZVFHMIN32-NEXT: vmv.x.s a6, v8 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 372(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 112(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: sb a0, 250(sp) ; ZVFHMIN32-NEXT: lh a0, 626(sp) -; ZVFHMIN32-NEXT: lh a1, 370(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 370(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 116(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: sb a0, 249(sp) ; ZVFHMIN32-NEXT: lh a0, 624(sp) -; ZVFHMIN32-NEXT: lh a1, 368(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 368(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 124(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: sb a0, 248(sp) ; ZVFHMIN32-NEXT: lh a0, 622(sp) -; ZVFHMIN32-NEXT: lh a1, 366(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN32-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 366(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 108(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: sb a0, 247(sp) ; ZVFHMIN32-NEXT: lh a0, 620(sp) -; ZVFHMIN32-NEXT: lh a1, 364(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN32-NEXT: feq.h a5, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 364(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: lw a1, 120(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN32-NEXT: sb a0, 246(sp) ; ZVFHMIN32-NEXT: lh a0, 618(sp) -; ZVFHMIN32-NEXT: lh a1, 362(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, t0 -; ZVFHMIN32-NEXT: feq.h t0, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 362(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, s2 ; ZVFHMIN32-NEXT: sb a0, 245(sp) ; ZVFHMIN32-NEXT: lh a0, 616(sp) -; ZVFHMIN32-NEXT: lh a1, 360(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN32-NEXT: feq.h a7, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 360(sp) +; ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t6 ; ZVFHMIN32-NEXT: sb a0, 244(sp) ; ZVFHMIN32-NEXT: lh a0, 614(sp) -; ZVFHMIN32-NEXT: lh a1, 358(sp) -; ZVFHMIN32-NEXT: fmv.h.x fa4, a6 -; ZVFHMIN32-NEXT: feq.h a6, fa5, fa4 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: lh a0, 358(sp) +; 
ZVFHMIN32-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN32-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN32-NEXT: lw a2, 124(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN32-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN32-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN32-NEXT: vmv.x.s a1, v8 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: vmv.x.s a2, v8 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: sb a0, 243(sp) ; ZVFHMIN32-NEXT: lh a0, 612(sp) -; ZVFHMIN32-NEXT: lh a1, 356(sp) -; ZVFHMIN32-NEXT: sb a5, 204(sp) -; ZVFHMIN32-NEXT: sb a4, 205(sp) -; ZVFHMIN32-NEXT: sb a2, 206(sp) -; ZVFHMIN32-NEXT: sb a3, 207(sp) -; ZVFHMIN32-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN32-NEXT: sb a2, 200(sp) -; ZVFHMIN32-NEXT: sb a6, 201(sp) -; ZVFHMIN32-NEXT: sb a7, 202(sp) -; ZVFHMIN32-NEXT: sb t0, 203(sp) -; ZVFHMIN32-NEXT: li a2, 128 +; ZVFHMIN32-NEXT: lh a2, 356(sp) +; ZVFHMIN32-NEXT: sb s6, 204(sp) +; ZVFHMIN32-NEXT: sb s8, 205(sp) +; ZVFHMIN32-NEXT: sb s7, 206(sp) +; ZVFHMIN32-NEXT: sb s3, 207(sp) +; ZVFHMIN32-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN32-NEXT: sb a3, 200(sp) +; ZVFHMIN32-NEXT: sb a1, 201(sp) +; ZVFHMIN32-NEXT: sb a4, 202(sp) +; ZVFHMIN32-NEXT: sb s4, 203(sp) +; ZVFHMIN32-NEXT: li a1, 128 ; ZVFHMIN32-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN32-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN32-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN32-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN32-NEXT: sb a0, 242(sp) ; ZVFHMIN32-NEXT: addi a0, sp, 128 -; ZVFHMIN32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; ZVFHMIN32-NEXT: vle8.v v8, (a0) ; ZVFHMIN32-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN32-NEXT: addi sp, s0, -896 -; ZVFHMIN32-NEXT: .cfi_def_cfa sp, 896 -; ZVFHMIN32-NEXT: lw ra, 892(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s0, 888(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s2, 884(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s3, 880(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s4, 876(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s5, 872(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s6, 868(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s7, 864(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s8, 860(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s9, 856(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s10, 852(sp) # 4-byte Folded Reload -; ZVFHMIN32-NEXT: lw s11, 848(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, s0, -1024 +; ZVFHMIN32-NEXT: .cfi_def_cfa sp, 1024 +; ZVFHMIN32-NEXT: lw ra, 1020(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s0, 1016(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s2, 1012(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s3, 1008(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s4, 1004(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s5, 1000(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s6, 996(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s7, 992(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s8, 988(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s9, 984(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s10, 980(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: lw s11, 976(sp) # 4-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs0, 968(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs1, 960(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs2, 952(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs3, 944(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs4, 936(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs5, 928(sp) # 8-byte Folded Reload +; 
ZVFHMIN32-NEXT: fld fs6, 920(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs7, 912(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs8, 904(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs9, 896(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs10, 888(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: fld fs11, 880(sp) # 8-byte Folded Reload ; ZVFHMIN32-NEXT: .cfi_restore ra ; ZVFHMIN32-NEXT: .cfi_restore s0 ; ZVFHMIN32-NEXT: .cfi_restore s2 @@ -2242,26 +2333,50 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN32-NEXT: .cfi_restore s9 ; ZVFHMIN32-NEXT: .cfi_restore s10 ; ZVFHMIN32-NEXT: .cfi_restore s11 -; ZVFHMIN32-NEXT: addi sp, sp, 896 +; ZVFHMIN32-NEXT: .cfi_restore fs0 +; ZVFHMIN32-NEXT: .cfi_restore fs1 +; ZVFHMIN32-NEXT: .cfi_restore fs2 +; ZVFHMIN32-NEXT: .cfi_restore fs3 +; ZVFHMIN32-NEXT: .cfi_restore fs4 +; ZVFHMIN32-NEXT: .cfi_restore fs5 +; ZVFHMIN32-NEXT: .cfi_restore fs6 +; ZVFHMIN32-NEXT: .cfi_restore fs7 +; ZVFHMIN32-NEXT: .cfi_restore fs8 +; ZVFHMIN32-NEXT: .cfi_restore fs9 +; ZVFHMIN32-NEXT: .cfi_restore fs10 +; ZVFHMIN32-NEXT: .cfi_restore fs11 +; ZVFHMIN32-NEXT: addi sp, sp, 1024 ; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN32-NEXT: ret ; ; ZVFHMIN64-LABEL: fcmp_oeq_vv_v128f16: ; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -896 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 896 -; ZVFHMIN64-NEXT: sd ra, 888(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s0, 880(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s2, 872(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s3, 864(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s4, 856(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s5, 848(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s6, 840(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s7, 832(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s8, 824(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s9, 816(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s10, 808(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: sd s11, 800(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: addi sp, sp, -1024 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 1024 +; ZVFHMIN64-NEXT: sd ra, 1016(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s0, 1008(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s2, 1000(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s3, 992(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s4, 984(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s5, 976(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s6, 968(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s7, 960(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s8, 952(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s9, 944(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s10, 936(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: sd s11, 928(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs0, 920(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs1, 912(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs2, 904(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs3, 896(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs4, 888(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs5, 880(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs6, 872(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs7, 864(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs8, 856(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs9, 848(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs10, 840(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fsd fs11, 832(sp) # 8-byte Folded Spill ; ZVFHMIN64-NEXT: .cfi_offset ra, -8 ; 
ZVFHMIN64-NEXT: .cfi_offset s0, -16 ; ZVFHMIN64-NEXT: .cfi_offset s2, -24 @@ -2274,1096 +2389,1175 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: .cfi_offset s9, -80 ; ZVFHMIN64-NEXT: .cfi_offset s10, -88 ; ZVFHMIN64-NEXT: .cfi_offset s11, -96 -; ZVFHMIN64-NEXT: addi s0, sp, 896 +; ZVFHMIN64-NEXT: .cfi_offset fs0, -104 +; ZVFHMIN64-NEXT: .cfi_offset fs1, -112 +; ZVFHMIN64-NEXT: .cfi_offset fs2, -120 +; ZVFHMIN64-NEXT: .cfi_offset fs3, -128 +; ZVFHMIN64-NEXT: .cfi_offset fs4, -136 +; ZVFHMIN64-NEXT: .cfi_offset fs5, -144 +; ZVFHMIN64-NEXT: .cfi_offset fs6, -152 +; ZVFHMIN64-NEXT: .cfi_offset fs7, -160 +; ZVFHMIN64-NEXT: .cfi_offset fs8, -168 +; ZVFHMIN64-NEXT: .cfi_offset fs9, -176 +; ZVFHMIN64-NEXT: .cfi_offset fs10, -184 +; ZVFHMIN64-NEXT: .cfi_offset fs11, -192 +; ZVFHMIN64-NEXT: addi s0, sp, 1024 ; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 ; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: li a2, 30 +; ZVFHMIN64-NEXT: li a2, 41 ; ZVFHMIN64-NEXT: mul a1, a1, a2 ; ZVFHMIN64-NEXT: sub sp, sp, a1 ; ZVFHMIN64-NEXT: andi sp, sp, -128 -; ZVFHMIN64-NEXT: addi a1, a0, 128 -; ZVFHMIN64-NEXT: li a2, 64 -; ZVFHMIN64-NEXT: addi a3, sp, 640 -; ZVFHMIN64-NEXT: addi a4, sp, 384 -; ZVFHMIN64-NEXT: addi a5, sp, 512 -; ZVFHMIN64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; ZVFHMIN64-NEXT: addi a3, a0, 128 +; ZVFHMIN64-NEXT: li a1, 64 +; ZVFHMIN64-NEXT: addi a4, sp, 640 +; ZVFHMIN64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: vle16.v v24, (a3) +; ZVFHMIN64-NEXT: csrr a3, vlenb +; ZVFHMIN64-NEXT: slli a5, a3, 5 +; ZVFHMIN64-NEXT: add a3, a5, a3 +; ZVFHMIN64-NEXT: add a3, sp, a3 +; ZVFHMIN64-NEXT: addi a3, a3, 832 +; ZVFHMIN64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; ZVFHMIN64-NEXT: vle16.v v0, (a0) -; ZVFHMIN64-NEXT: addi a0, sp, 256 -; ZVFHMIN64-NEXT: vle16.v v24, (a1) -; ZVFHMIN64-NEXT: vse16.v v8, (a3) -; ZVFHMIN64-NEXT: vse16.v v0, (a4) -; ZVFHMIN64-NEXT: vse16.v v16, (a5) -; ZVFHMIN64-NEXT: vse16.v v24, (a0) -; ZVFHMIN64-NEXT: lh a0, 704(sp) +; ZVFHMIN64-NEXT: vse16.v v8, (a4) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 5 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 6 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 5 +; ZVFHMIN64-NEXT: sub a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 5 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 30 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 4 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 29 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 3 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 28 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 2 
+; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 27 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 1 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 26 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 15 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 24 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 14 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 22 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 13 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 20 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 18 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 11 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 3 +; ZVFHMIN64-NEXT: add a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 10 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 3 +; ZVFHMIN64-NEXT: sub a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 9 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 2 +; ZVFHMIN64-NEXT: add a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 8 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 1 +; ZVFHMIN64-NEXT: add a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: addi a0, sp, 384 +; ZVFHMIN64-NEXT: addi a3, sp, 512 +; ZVFHMIN64-NEXT: vmv.x.s a5, v16 +; ZVFHMIN64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN64-NEXT: vse16.v v0, (a0) +; ZVFHMIN64-NEXT: vse16.v v16, (a3) +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 7 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 11 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 6 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 12 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 
+; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 5 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 13 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 4 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 14 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 4 +; ZVFHMIN64-NEXT: sub a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 4 +; ZVFHMIN64-NEXT: add a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 15 +; ZVFHMIN64-NEXT: addi a0, sp, 832 +; ZVFHMIN64-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vslidedown.vi v4, v16, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v2, v16, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v24, v16, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v22, v16, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v20, v16, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v16, v16, 8 +; ZVFHMIN64-NEXT: vmv.x.s a6, v0 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v5, v0, 7 +; ZVFHMIN64-NEXT: vslidedown.vi v17, v0, 6 +; ZVFHMIN64-NEXT: vslidedown.vi v23, v0, 5 +; ZVFHMIN64-NEXT: vslidedown.vi v19, v0, 4 +; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 3 +; ZVFHMIN64-NEXT: vslidedown.vi v3, v0, 2 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v0, 1 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v0, 15 +; ZVFHMIN64-NEXT: vslidedown.vi v10, v0, 14 +; ZVFHMIN64-NEXT: vslidedown.vi v12, v0, 13 +; ZVFHMIN64-NEXT: vslidedown.vi v14, v0, 12 +; ZVFHMIN64-NEXT: vslidedown.vi v26, v0, 11 +; ZVFHMIN64-NEXT: vslidedown.vi v28, v0, 10 +; ZVFHMIN64-NEXT: vslidedown.vi v30, v0, 9 +; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: li a3, 24 +; ZVFHMIN64-NEXT: mul a0, a0, a3 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a0) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a0, v6 +; ZVFHMIN64-NEXT: csrr a3, vlenb +; ZVFHMIN64-NEXT: li a4, 22 +; ZVFHMIN64-NEXT: mul a3, a3, a4 +; ZVFHMIN64-NEXT: add a3, sp, a3 +; ZVFHMIN64-NEXT: addi a3, a3, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a3) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a3, v6 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: li a7, 20 +; ZVFHMIN64-NEXT: mul a4, a4, 
a7 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s a7, v6 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: li t0, 18 +; ZVFHMIN64-NEXT: mul a4, a4, t0 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s3, v6 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: slli t0, a4, 3 +; ZVFHMIN64-NEXT: add a4, t0, a4 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s10, v6 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: slli t0, a4, 3 +; ZVFHMIN64-NEXT: sub a4, t0, a4 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s11, v6 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: slli t0, a4, 2 +; ZVFHMIN64-NEXT: add a4, t0, a4 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s5, v6 +; ZVFHMIN64-NEXT: csrr a4, vlenb +; ZVFHMIN64-NEXT: slli t0, a4, 1 +; ZVFHMIN64-NEXT: add a4, t0, a4 +; ZVFHMIN64-NEXT: add a4, sp, a4 +; ZVFHMIN64-NEXT: addi a4, a4, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s7, v6 +; ZVFHMIN64-NEXT: addi a4, sp, 832 +; ZVFHMIN64-NEXT: vl2r.v v6, (a4) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vmv.x.s s9, v6 +; ZVFHMIN64-NEXT: vmv.x.s s8, v4 +; ZVFHMIN64-NEXT: vmv.x.s s6, v2 +; ZVFHMIN64-NEXT: vmv.x.s s4, v24 +; ZVFHMIN64-NEXT: vmv.x.s s2, v22 +; ZVFHMIN64-NEXT: vmv.x.s a4, v20 +; ZVFHMIN64-NEXT: vmv.x.s t0, v18 +; ZVFHMIN64-NEXT: sd t0, 112(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s t0, v16 +; ZVFHMIN64-NEXT: sd t0, 120(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s t6, v8 +; ZVFHMIN64-NEXT: vmv.x.s t0, v10 +; ZVFHMIN64-NEXT: vmv.x.s t1, v12 +; ZVFHMIN64-NEXT: vmv.x.s t2, v14 +; ZVFHMIN64-NEXT: vmv.x.s t3, v26 +; ZVFHMIN64-NEXT: vmv.x.s t4, v28 +; ZVFHMIN64-NEXT: vmv.x.s t5, v30 +; ZVFHMIN64-NEXT: fmv.h.x fs8, a2 +; ZVFHMIN64-NEXT: fmv.h.x fs7, a5 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fs6, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a5, a2, 5 +; ZVFHMIN64-NEXT: sub a2, a5, a2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fs5, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 30 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft10, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 29 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft8, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 28 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft2, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 27 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; 
ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft3, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 26 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft4, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 11 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft5, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 12 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x ft6, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 13 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa6, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: li a5, 14 +; ZVFHMIN64-NEXT: mul a2, a2, a5 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fs0, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a5, a2, 4 +; ZVFHMIN64-NEXT: sub a2, a5, a2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fs1, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a2, a2, 4 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fs2, a2 +; ZVFHMIN64-NEXT: csrr a2, vlenb +; ZVFHMIN64-NEXT: slli a5, a2, 4 +; ZVFHMIN64-NEXT: add a2, a5, a2 +; ZVFHMIN64-NEXT: add a2, sp, a2 +; ZVFHMIN64-NEXT: lh a2, 832(a2) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fs3, a2 +; ZVFHMIN64-NEXT: addi a2, sp, 256 +; ZVFHMIN64-NEXT: fmv.h.x fs4, a0 +; ZVFHMIN64-NEXT: fmv.h.x ft7, a3 +; ZVFHMIN64-NEXT: fmv.h.x ft11, a7 +; ZVFHMIN64-NEXT: fmv.h.x ft9, s3 +; ZVFHMIN64-NEXT: fmv.h.x fa7, s10 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s11 +; ZVFHMIN64-NEXT: fsh fa5, 102(sp) # 2-byte Folded Spill +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a3, a0, 5 +; ZVFHMIN64-NEXT: add a0, a3, a0 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: addi a0, a0, 832 +; ZVFHMIN64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFHMIN64-NEXT: vse16.v v24, (a2) +; ZVFHMIN64-NEXT: vmv.x.s a3, v0 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 +; ZVFHMIN64-NEXT: vmv.x.s a5, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14 +; ZVFHMIN64-NEXT: vmv.x.s ra, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 +; ZVFHMIN64-NEXT: vmv.x.s a1, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 +; ZVFHMIN64-NEXT: vmv.x.s s3, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 +; ZVFHMIN64-NEXT: vmv.x.s a7, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: sd a0, 104(sp) # 8-byte Folded Spill +; ZVFHMIN64-NEXT: fmv.h.x fa3, s5 +; ZVFHMIN64-NEXT: vmv.x.s s5, v5 +; ZVFHMIN64-NEXT: fmv.h.x fa2, s7 +; ZVFHMIN64-NEXT: vmv.x.s s7, v17 +; ZVFHMIN64-NEXT: fmv.h.x fa1, s9 +; ZVFHMIN64-NEXT: vmv.x.s s9, v23 +; ZVFHMIN64-NEXT: fmv.h.x fa0, s8 +; ZVFHMIN64-NEXT: vmv.x.s s8, v19 +; ZVFHMIN64-NEXT: fmv.h.x ft0, s6 +; ZVFHMIN64-NEXT: vmv.x.s s6, v21 +; ZVFHMIN64-NEXT: 
fmv.h.x ft1, s4 +; ZVFHMIN64-NEXT: vmv.x.s s10, v3 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 +; ZVFHMIN64-NEXT: fsh fa5, 100(sp) # 2-byte Folded Spill +; ZVFHMIN64-NEXT: vmv.x.s s2, v24 +; ZVFHMIN64-NEXT: fmv.h.x fs9, a6 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: add a0, sp, a0 +; ZVFHMIN64-NEXT: lh a6, 832(a0) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 +; ZVFHMIN64-NEXT: fmv.h.x fs10, s2 +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 +; ZVFHMIN64-NEXT: fmv.h.x fs11, s5 +; ZVFHMIN64-NEXT: feq.h s2, fs8, fs9 +; ZVFHMIN64-NEXT: fmv.h.x fs8, s7 +; ZVFHMIN64-NEXT: vmv.x.s s7, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 +; ZVFHMIN64-NEXT: fmv.h.x fs9, s9 +; ZVFHMIN64-NEXT: feq.h s11, fs7, fs10 +; ZVFHMIN64-NEXT: fmv.h.x fs7, s8 +; ZVFHMIN64-NEXT: vmv.x.s s8, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 +; ZVFHMIN64-NEXT: fmv.h.x fs10, s6 +; ZVFHMIN64-NEXT: feq.h s4, fs6, fs11 +; ZVFHMIN64-NEXT: fmv.h.x fs6, s10 +; ZVFHMIN64-NEXT: vmv.x.s s9, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 +; ZVFHMIN64-NEXT: fmv.h.x fs11, a6 +; ZVFHMIN64-NEXT: feq.h s5, fs5, fs8 +; ZVFHMIN64-NEXT: fmv.h.x fs5, a0 +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 +; ZVFHMIN64-NEXT: fmv.h.x fs8, s7 +; ZVFHMIN64-NEXT: feq.h s6, ft10, fs9 +; ZVFHMIN64-NEXT: fmv.h.x fs9, s8 +; ZVFHMIN64-NEXT: vmv.x.s a6, v8 +; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 +; ZVFHMIN64-NEXT: feq.h s7, ft8, fs7 +; ZVFHMIN64-NEXT: fmv.h.x fs7, a0 +; ZVFHMIN64-NEXT: vmv.x.s a0, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 +; ZVFHMIN64-NEXT: feq.h s8, ft2, fs10 +; ZVFHMIN64-NEXT: fmv.h.x fs10, a0 +; ZVFHMIN64-NEXT: feq.h s9, ft3, fs6 +; ZVFHMIN64-NEXT: fmv.h.x fs6, t6 +; ZVFHMIN64-NEXT: feq.h s10, ft4, fs11 +; ZVFHMIN64-NEXT: fmv.h.x fs11, t0 +; ZVFHMIN64-NEXT: feq.h t0, ft5, fs5 +; ZVFHMIN64-NEXT: fmv.h.x fs5, t1 +; ZVFHMIN64-NEXT: feq.h t1, ft6, fs8 +; ZVFHMIN64-NEXT: fmv.h.x ft10, t2 +; ZVFHMIN64-NEXT: feq.h t2, fa6, fs9 +; ZVFHMIN64-NEXT: fmv.h.x ft8, t3 +; ZVFHMIN64-NEXT: feq.h t3, fs0, fa5 +; ZVFHMIN64-NEXT: fmv.h.x ft2, t4 +; ZVFHMIN64-NEXT: feq.h t4, fs1, fs7 +; ZVFHMIN64-NEXT: fmv.h.x ft3, t5 +; ZVFHMIN64-NEXT: feq.h t5, fs2, fa4 +; ZVFHMIN64-NEXT: fmv.h.x ft4, a3 +; ZVFHMIN64-NEXT: feq.h t6, fs3, fs10 +; ZVFHMIN64-NEXT: fmv.h.x ft5, a5 +; ZVFHMIN64-NEXT: feq.h a0, fs4, fs6 +; ZVFHMIN64-NEXT: fmv.h.x ft6, ra +; ZVFHMIN64-NEXT: feq.h a5, ft7, fs11 +; ZVFHMIN64-NEXT: fmv.h.x ft7, a2 +; ZVFHMIN64-NEXT: lh a2, 704(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa6, a1 +; ZVFHMIN64-NEXT: feq.h a6, ft11, fs5 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 ; ZVFHMIN64-NEXT: lh a1, 448(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 160(sp) -; ZVFHMIN64-NEXT: lh a0, 702(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 160(sp) +; ZVFHMIN64-NEXT: lh a1, 702(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 446(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 159(sp) -; ZVFHMIN64-NEXT: lh a0, 700(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 159(sp) +; ZVFHMIN64-NEXT: lh a1, 700(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 444(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, 
fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 158(sp) -; ZVFHMIN64-NEXT: lh a0, 698(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 158(sp) +; ZVFHMIN64-NEXT: lh a1, 698(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 442(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 157(sp) -; ZVFHMIN64-NEXT: lh a0, 696(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 157(sp) +; ZVFHMIN64-NEXT: lh a1, 696(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 440(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 156(sp) -; ZVFHMIN64-NEXT: lh a0, 694(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 156(sp) +; ZVFHMIN64-NEXT: lh a1, 694(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 438(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 155(sp) -; ZVFHMIN64-NEXT: lh a0, 692(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 155(sp) +; ZVFHMIN64-NEXT: lh a1, 692(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 436(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 154(sp) -; ZVFHMIN64-NEXT: lh a0, 690(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 154(sp) +; ZVFHMIN64-NEXT: lh a1, 690(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 434(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 153(sp) -; ZVFHMIN64-NEXT: lh a0, 688(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 153(sp) +; ZVFHMIN64-NEXT: lh a1, 688(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 432(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 152(sp) -; ZVFHMIN64-NEXT: lh a0, 686(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 152(sp) +; ZVFHMIN64-NEXT: lh a1, 686(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 430(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 151(sp) -; ZVFHMIN64-NEXT: lh a0, 684(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 151(sp) +; ZVFHMIN64-NEXT: lh a1, 684(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 428(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 150(sp) -; ZVFHMIN64-NEXT: lh a0, 682(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 150(sp) +; ZVFHMIN64-NEXT: lh a1, 682(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 426(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 149(sp) -; ZVFHMIN64-NEXT: lh a0, 680(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 149(sp) +; ZVFHMIN64-NEXT: lh a1, 680(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 424(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 148(sp) -; ZVFHMIN64-NEXT: lh 
a0, 678(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 148(sp) +; ZVFHMIN64-NEXT: lh a1, 678(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 422(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 147(sp) -; ZVFHMIN64-NEXT: lh a0, 676(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 147(sp) +; ZVFHMIN64-NEXT: lh a1, 676(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 420(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 146(sp) -; ZVFHMIN64-NEXT: lh a0, 674(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 146(sp) +; ZVFHMIN64-NEXT: lh a1, 674(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 418(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 -; ZVFHMIN64-NEXT: vmv.x.s a2, v0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN64-NEXT: sb a0, 145(sp) -; ZVFHMIN64-NEXT: lh a0, 672(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 145(sp) +; ZVFHMIN64-NEXT: lh a1, 672(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 416(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 128(sp) -; ZVFHMIN64-NEXT: sb a0, 144(sp) -; ZVFHMIN64-NEXT: lh a0, 576(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb s2, 128(sp) +; ZVFHMIN64-NEXT: feq.h s2, ft9, ft10 +; ZVFHMIN64-NEXT: sb a1, 144(sp) +; ZVFHMIN64-NEXT: lh a1, 576(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 320(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 224(sp) -; ZVFHMIN64-NEXT: lh a0, 574(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 224(sp) +; ZVFHMIN64-NEXT: lh a1, 574(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 318(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 223(sp) -; ZVFHMIN64-NEXT: lh a0, 572(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 223(sp) +; ZVFHMIN64-NEXT: lh a1, 572(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 316(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 222(sp) -; ZVFHMIN64-NEXT: lh a0, 570(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 222(sp) +; ZVFHMIN64-NEXT: lh a1, 570(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 314(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 221(sp) -; ZVFHMIN64-NEXT: lh a0, 568(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 221(sp) +; ZVFHMIN64-NEXT: lh a1, 568(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 312(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 220(sp) -; ZVFHMIN64-NEXT: lh a0, 566(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 220(sp) +; 
ZVFHMIN64-NEXT: lh a1, 566(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 310(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 219(sp) -; ZVFHMIN64-NEXT: lh a0, 564(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 219(sp) +; ZVFHMIN64-NEXT: lh a1, 564(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 308(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 218(sp) -; ZVFHMIN64-NEXT: lh a0, 562(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 218(sp) +; ZVFHMIN64-NEXT: lh a1, 562(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 306(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 7 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 29 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 6 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 28 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 5 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 27 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 4 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 26 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 3 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 25 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 2 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 24 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 1 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 23 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v26, v8, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v20, v8, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v28, v8, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v8, 12 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v10, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v4, v8, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v2, v8, 10 -; ZVFHMIN64-NEXT: vslidedown.vi v30, v8, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v22, v8, 8 -; ZVFHMIN64-NEXT: vmv.x.s a4, v16 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 
217(sp) -; ZVFHMIN64-NEXT: lh a0, 560(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 217(sp) +; ZVFHMIN64-NEXT: lh a1, 560(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 304(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v3, v16, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v31, v16, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v5, v16, 5 -; ZVFHMIN64-NEXT: vslidedown.vi v23, v16, 4 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 3 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 2 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 20 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 1 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 22 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v18, v16, 15 -; ZVFHMIN64-NEXT: vslidedown.vi v14, v16, 14 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v16, 13 -; ZVFHMIN64-NEXT: vslidedown.vi v12, v16, 12 -; ZVFHMIN64-NEXT: vslidedown.vi v10, v16, 11 -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 10 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 18 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 9 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 14 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v6, v16, 8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 216(sp) -; ZVFHMIN64-NEXT: lh a0, 558(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 216(sp) +; ZVFHMIN64-NEXT: lh a1, 558(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 302(sp) -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v13, v0, 7 -; ZVFHMIN64-NEXT: vslidedown.vi v29, v0, 6 -; ZVFHMIN64-NEXT: vslidedown.vi v11, v0, 5 -; ZVFHMIN64-NEXT: vslidedown.vi v7, v0, 4 -; ZVFHMIN64-NEXT: vslidedown.vi v9, v0, 3 -; ZVFHMIN64-NEXT: vslidedown.vi v21, v0, 2 -; ZVFHMIN64-NEXT: vslidedown.vi v27, v0, 1 -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 15 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 2 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 14 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 13 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 6 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; 
ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 12 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 12 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 11 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 10 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 10 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 4 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vslidedown.vi v16, v0, 9 -; ZVFHMIN64-NEXT: vslidedown.vi v0, v0, 8 -; ZVFHMIN64-NEXT: addi a2, sp, 800 -; ZVFHMIN64-NEXT: vs2r.v v0, (a2) # Unknown-size Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s t4, v26 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 215(sp) -; ZVFHMIN64-NEXT: lh a0, 556(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 215(sp) +; ZVFHMIN64-NEXT: lh a1, 556(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 300(sp) -; ZVFHMIN64-NEXT: vmv.x.s t3, v20 -; ZVFHMIN64-NEXT: vmv.x.s t1, v28 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 214(sp) -; ZVFHMIN64-NEXT: lh a0, 554(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 214(sp) +; ZVFHMIN64-NEXT: lh a1, 554(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 298(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 1 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v0, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t2, v0 -; ZVFHMIN64-NEXT: vmv.x.s t0, v4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 213(sp) -; ZVFHMIN64-NEXT: lh a0, 552(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 213(sp) +; ZVFHMIN64-NEXT: lh a1, 552(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 296(sp) -; ZVFHMIN64-NEXT: vmv.x.s a7, v2 -; ZVFHMIN64-NEXT: vmv.x.s a6, v30 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 212(sp) -; ZVFHMIN64-NEXT: lh a0, 550(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 212(sp) +; ZVFHMIN64-NEXT: lh a1, 550(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 294(sp) -; ZVFHMIN64-NEXT: vmv.x.s a5, v22 -; ZVFHMIN64-NEXT: vmv.x.s a2, v18 -; ZVFHMIN64-NEXT: sd a2, 96(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 211(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 211(sp) ; ZVFHMIN64-NEXT: lh a1, 548(sp) -; ZVFHMIN64-NEXT: lh t5, 292(sp) -; ZVFHMIN64-NEXT: vmv.x.s a0, v14 -; ZVFHMIN64-NEXT: sd a0, 104(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a0, v8 -; ZVFHMIN64-NEXT: sd a0, 120(sp) # 8-byte Folded Spill ; 
ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN64-NEXT: lh a1, 292(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: sb a1, 210(sp) ; ZVFHMIN64-NEXT: lh a1, 546(sp) -; ZVFHMIN64-NEXT: lh t5, 290(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 -; ZVFHMIN64-NEXT: vmv.x.s a4, v24 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 290(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa3, t5 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: sb a1, 209(sp) ; ZVFHMIN64-NEXT: lh a1, 544(sp) -; ZVFHMIN64-NEXT: lh t5, 288(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t5 +; ZVFHMIN64-NEXT: lh a1, 288(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a4, 192(sp) +; ZVFHMIN64-NEXT: sb s11, 192(sp) +; ZVFHMIN64-NEXT: feq.h s11, fa7, ft8 ; ZVFHMIN64-NEXT: sb a1, 208(sp) -; ZVFHMIN64-NEXT: lh t5, 738(sp) -; ZVFHMIN64-NEXT: lh t6, 482(sp) -; ZVFHMIN64-NEXT: vmv.x.s a0, v12 -; ZVFHMIN64-NEXT: sd a0, 88(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: vmv.x.s a0, v10 -; ZVFHMIN64-NEXT: sd a0, 112(sp) # 8-byte Folded Spill -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 177(sp) -; ZVFHMIN64-NEXT: lh t5, 736(sp) -; ZVFHMIN64-NEXT: lh t6, 480(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 29 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s5, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 28 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s6, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 176(sp) -; ZVFHMIN64-NEXT: lh t5, 734(sp) -; ZVFHMIN64-NEXT: lh t6, 478(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 27 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s7, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 26 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s8, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 175(sp) -; ZVFHMIN64-NEXT: lh t5, 732(sp) -; ZVFHMIN64-NEXT: lh t6, 476(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 25 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s4, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 24 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s3, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa4 -; ZVFHMIN64-NEXT: sb t5, 174(sp) -; ZVFHMIN64-NEXT: lh t6, 730(sp) -; ZVFHMIN64-NEXT: lh s9, 474(sp) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: li a1, 23 -; ZVFHMIN64-NEXT: mul a0, a0, a1 -; ZVFHMIN64-NEXT: add a0, sp, a0 -; ZVFHMIN64-NEXT: lh s2, 800(a0) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t5, v3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 
-; ZVFHMIN64-NEXT: feq.h t6, fa5, fa4 -; ZVFHMIN64-NEXT: sb t6, 173(sp) -; ZVFHMIN64-NEXT: lh s9, 728(sp) -; ZVFHMIN64-NEXT: lh s10, 472(sp) -; ZVFHMIN64-NEXT: vmv.x.s t6, v31 -; ZVFHMIN64-NEXT: vmv.x.s ra, v13 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN64-NEXT: sb s9, 172(sp) -; ZVFHMIN64-NEXT: lh s9, 726(sp) -; ZVFHMIN64-NEXT: lh s10, 470(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v29 -; ZVFHMIN64-NEXT: vmv.x.s a3, v11 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s9 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s10 -; ZVFHMIN64-NEXT: feq.h s9, fa5, fa4 -; ZVFHMIN64-NEXT: sb s9, 171(sp) -; ZVFHMIN64-NEXT: lh s10, 724(sp) -; ZVFHMIN64-NEXT: lh s11, 468(sp) -; ZVFHMIN64-NEXT: vmv.x.s a4, v7 -; ZVFHMIN64-NEXT: vmv.x.s s9, v9 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s11 -; ZVFHMIN64-NEXT: feq.h s10, fa5, fa4 -; ZVFHMIN64-NEXT: sb s10, 170(sp) -; ZVFHMIN64-NEXT: lh a0, 722(sp) +; ZVFHMIN64-NEXT: lh a1, 738(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 482(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 177(sp) +; ZVFHMIN64-NEXT: lh a1, 736(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 480(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 176(sp) +; ZVFHMIN64-NEXT: lh a1, 734(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 478(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 175(sp) +; ZVFHMIN64-NEXT: lh a1, 732(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 476(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 174(sp) +; ZVFHMIN64-NEXT: lh a1, 730(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 474(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 173(sp) +; ZVFHMIN64-NEXT: lh a1, 728(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 472(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 172(sp) +; ZVFHMIN64-NEXT: lh a1, 726(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 470(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 171(sp) +; ZVFHMIN64-NEXT: lh a1, 724(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 468(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 170(sp) +; ZVFHMIN64-NEXT: lh a1, 722(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 466(sp) -; ZVFHMIN64-NEXT: vmv.x.s s10, v21 -; ZVFHMIN64-NEXT: vmv.x.s s11, v27 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 169(sp) -; ZVFHMIN64-NEXT: lh a0, 720(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 169(sp) +; ZVFHMIN64-NEXT: lh a1, 720(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 464(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s6 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 168(sp) -; ZVFHMIN64-NEXT: lh a0, 718(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 168(sp) +; ZVFHMIN64-NEXT: lh a1, 718(sp) +; 
ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 462(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, s7 -; ZVFHMIN64-NEXT: fmv.h.x fa2, s8 -; ZVFHMIN64-NEXT: fmv.h.x fa1, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa0, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa1, fa0 -; ZVFHMIN64-NEXT: fmv.h.x fa1, ra -; ZVFHMIN64-NEXT: sb a0, 167(sp) -; ZVFHMIN64-NEXT: lh a0, 716(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa0, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 167(sp) +; ZVFHMIN64-NEXT: lh a1, 716(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 460(sp) -; ZVFHMIN64-NEXT: feq.h s5, fa5, fa1 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 ; ZVFHMIN64-NEXT: sb a1, 166(sp) ; ZVFHMIN64-NEXT: lh a1, 714(sp) -; ZVFHMIN64-NEXT: lh a2, 458(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h a3, fa3, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 458(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: sb a1, 165(sp) ; ZVFHMIN64-NEXT: lh a1, 712(sp) -; ZVFHMIN64-NEXT: lh a2, 456(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa3, a4 -; ZVFHMIN64-NEXT: feq.h a4, fa2, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa3, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, s2 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 456(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: sb a1, 164(sp) ; ZVFHMIN64-NEXT: lh a1, 710(sp) -; ZVFHMIN64-NEXT: lh a2, 454(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, s9 -; ZVFHMIN64-NEXT: feq.h s2, fa5, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s10 -; ZVFHMIN64-NEXT: fmv.h.x fa2, s11 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 454(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 ; ZVFHMIN64-NEXT: sb a1, 163(sp) ; ZVFHMIN64-NEXT: lh a1, 708(sp) -; ZVFHMIN64-NEXT: lh a2, 452(sp) -; ZVFHMIN64-NEXT: feq.h s3, fa4, fa5 -; ZVFHMIN64-NEXT: feq.h s4, fa3, fa2 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: lh a1, 452(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 ; ZVFHMIN64-NEXT: sb a1, 162(sp) ; ZVFHMIN64-NEXT: lh a1, 706(sp) ; ZVFHMIN64-NEXT: lh a2, 450(sp) -; ZVFHMIN64-NEXT: sb s4, 129(sp) -; ZVFHMIN64-NEXT: sb s3, 130(sp) -; ZVFHMIN64-NEXT: sb s2, 131(sp) -; ZVFHMIN64-NEXT: sb a4, 132(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 133(sp) -; ZVFHMIN64-NEXT: sb a0, 134(sp) -; ZVFHMIN64-NEXT: sb s5, 135(sp) +; ZVFHMIN64-NEXT: sb s10, 129(sp) +; ZVFHMIN64-NEXT: flh fa4, 102(sp) # 2-byte Folded Reload +; ZVFHMIN64-NEXT: feq.h s10, fa4, ft2 +; ZVFHMIN64-NEXT: sb s9, 130(sp) +; ZVFHMIN64-NEXT: feq.h s9, fa3, ft3 +; ZVFHMIN64-NEXT: sb s8, 131(sp) +; ZVFHMIN64-NEXT: feq.h ra, fa2, ft4 +; ZVFHMIN64-NEXT: sb s7, 132(sp) +; ZVFHMIN64-NEXT: feq.h s3, fa1, ft5 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h s7, fa0, ft6 +; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 +; ZVFHMIN64-NEXT: feq.h s8, ft0, ft7 +; ZVFHMIN64-NEXT: sb s6, 133(sp) +; 
ZVFHMIN64-NEXT: feq.h s6, ft1, fa6 +; ZVFHMIN64-NEXT: sb s5, 134(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa4, fa3 +; ZVFHMIN64-NEXT: sb s4, 135(sp) +; ZVFHMIN64-NEXT: flh fa4, 100(sp) # 2-byte Folded Reload +; ZVFHMIN64-NEXT: feq.h s4, fa4, fa5 ; ZVFHMIN64-NEXT: sb a1, 161(sp) -; ZVFHMIN64-NEXT: lh a0, 610(sp) +; ZVFHMIN64-NEXT: lh a1, 610(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 354(sp) -; ZVFHMIN64-NEXT: vmv.x.s s6, v5 -; ZVFHMIN64-NEXT: vmv.x.s s5, v23 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 241(sp) -; ZVFHMIN64-NEXT: lh a0, 608(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 241(sp) +; ZVFHMIN64-NEXT: lh a1, 608(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 352(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 21 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s4, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 20 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s3, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 240(sp) -; ZVFHMIN64-NEXT: lh a0, 606(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 240(sp) +; ZVFHMIN64-NEXT: lh a1, 606(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 350(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 22 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: lh s2, 800(a2) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa4, fa3 -; ZVFHMIN64-NEXT: sb a0, 239(sp) -; ZVFHMIN64-NEXT: lh a0, 604(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 239(sp) +; ZVFHMIN64-NEXT: lh a1, 604(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 348(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t6 -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 7 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 238(sp) -; ZVFHMIN64-NEXT: lh a0, 602(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 238(sp) +; ZVFHMIN64-NEXT: lh a1, 602(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 346(sp) -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 6 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 237(sp) -; ZVFHMIN64-NEXT: lh a0, 600(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 237(sp) +; ZVFHMIN64-NEXT: lh a1, 600(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 344(sp) -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 5 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 236(sp) -; ZVFHMIN64-NEXT: lh a0, 598(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 236(sp) +; ZVFHMIN64-NEXT: lh a1, 598(sp) +; ZVFHMIN64-NEXT: 
fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 342(sp) -; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 4 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 235(sp) -; ZVFHMIN64-NEXT: lh a0, 596(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 235(sp) +; ZVFHMIN64-NEXT: lh a1, 596(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 340(sp) -; ZVFHMIN64-NEXT: vmv.x.s s8, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 3 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 234(sp) -; ZVFHMIN64-NEXT: lh a0, 594(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 234(sp) +; ZVFHMIN64-NEXT: lh a1, 594(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 338(sp) -; ZVFHMIN64-NEXT: vmv.x.s s9, v8 -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa2, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: sb a0, 233(sp) -; ZVFHMIN64-NEXT: lh a0, 592(sp) -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: lh t5, 336(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 1 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a0 -; ZVFHMIN64-NEXT: vmv.x.s s7, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa2, t5 -; ZVFHMIN64-NEXT: feq.h a0, fa3, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa3, a2 -; ZVFHMIN64-NEXT: sb a0, 232(sp) -; ZVFHMIN64-NEXT: lh a0, 590(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa2, a3 -; ZVFHMIN64-NEXT: lh a2, 334(sp) -; ZVFHMIN64-NEXT: feq.h t5, fa5, fa3 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: feq.h t6, fa4, fa2 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s6 -; ZVFHMIN64-NEXT: sb a0, 231(sp) -; ZVFHMIN64-NEXT: lh a0, 588(sp) -; ZVFHMIN64-NEXT: lh a2, 332(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s5 -; ZVFHMIN64-NEXT: sb a0, 230(sp) -; ZVFHMIN64-NEXT: lh a0, 586(sp) -; ZVFHMIN64-NEXT: lh a2, 330(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s8 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s4 -; ZVFHMIN64-NEXT: sb a0, 229(sp) -; ZVFHMIN64-NEXT: lh a0, 584(sp) -; ZVFHMIN64-NEXT: lh a2, 328(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s9 -; ZVFHMIN64-NEXT: feq.h s4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s3 -; ZVFHMIN64-NEXT: sb a0, 228(sp) -; ZVFHMIN64-NEXT: lh a0, 582(sp) -; ZVFHMIN64-NEXT: lh a2, 326(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 -; ZVFHMIN64-NEXT: sb a0, 227(sp) -; ZVFHMIN64-NEXT: lh a0, 580(sp) -; ZVFHMIN64-NEXT: lh a2, 324(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s7 -; ZVFHMIN64-NEXT: feq.h s2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 226(sp) -; ZVFHMIN64-NEXT: lh a0, 578(sp) +; ZVFHMIN64-NEXT: sb a1, 233(sp) +; ZVFHMIN64-NEXT: lh a1, 
592(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 336(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 232(sp) +; ZVFHMIN64-NEXT: lh a1, 590(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 334(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 231(sp) +; ZVFHMIN64-NEXT: lh a1, 588(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 332(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 230(sp) +; ZVFHMIN64-NEXT: lh a1, 586(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 330(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 229(sp) +; ZVFHMIN64-NEXT: lh a1, 584(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 328(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 228(sp) +; ZVFHMIN64-NEXT: lh a1, 582(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 326(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 227(sp) +; ZVFHMIN64-NEXT: lh a1, 580(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 324(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a4 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN64-NEXT: sb a1, 226(sp) +; ZVFHMIN64-NEXT: lh a1, 578(sp) ; ZVFHMIN64-NEXT: lh a2, 322(sp) -; ZVFHMIN64-NEXT: sb s2, 193(sp) -; ZVFHMIN64-NEXT: sb a1, 194(sp) -; ZVFHMIN64-NEXT: sb s4, 195(sp) -; ZVFHMIN64-NEXT: sb a4, 196(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 +; ZVFHMIN64-NEXT: sb t6, 193(sp) +; ZVFHMIN64-NEXT: sb t5, 194(sp) +; ZVFHMIN64-NEXT: sb t4, 195(sp) +; ZVFHMIN64-NEXT: sb t3, 196(sp) +; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a3, 197(sp) -; ZVFHMIN64-NEXT: sb t6, 198(sp) -; ZVFHMIN64-NEXT: sb t5, 199(sp) -; ZVFHMIN64-NEXT: sb a0, 225(sp) -; ZVFHMIN64-NEXT: lh a0, 766(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb t2, 197(sp) +; ZVFHMIN64-NEXT: sb t1, 198(sp) +; ZVFHMIN64-NEXT: sb t0, 199(sp) +; ZVFHMIN64-NEXT: sb a1, 225(sp) +; ZVFHMIN64-NEXT: lh a1, 766(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 510(sp) -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 18 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s2, v8 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: li a3, 14 -; ZVFHMIN64-NEXT: mul a2, a2, a3 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s t6, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 191(sp) -; ZVFHMIN64-NEXT: lh a0, 764(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 191(sp) +; ZVFHMIN64-NEXT: lh a1, 764(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 508(sp) -; ZVFHMIN64-NEXT: vmv.x.s t5, v6 -; ZVFHMIN64-NEXT: csrr a2, vlenb -; ZVFHMIN64-NEXT: slli a2, a2, 2 -; ZVFHMIN64-NEXT: add a2, sp, a2 -; ZVFHMIN64-NEXT: addi a2, a2, 800 -; ZVFHMIN64-NEXT: 
vl2r.v v8, (a2) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a2, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 190(sp) -; ZVFHMIN64-NEXT: lh a0, 762(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 190(sp) +; ZVFHMIN64-NEXT: lh a1, 762(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 506(sp) -; ZVFHMIN64-NEXT: csrr a3, vlenb -; ZVFHMIN64-NEXT: slli a3, a3, 3 -; ZVFHMIN64-NEXT: add a3, sp, a3 -; ZVFHMIN64-NEXT: addi a3, a3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a3, v8 -; ZVFHMIN64-NEXT: csrr a4, vlenb -; ZVFHMIN64-NEXT: li s3, 6 -; ZVFHMIN64-NEXT: mul a4, a4, s3 -; ZVFHMIN64-NEXT: add a4, sp, a4 -; ZVFHMIN64-NEXT: addi a4, a4, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a4, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 189(sp) -; ZVFHMIN64-NEXT: lh a0, 760(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 189(sp) +; ZVFHMIN64-NEXT: lh a1, 760(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 504(sp) -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: li s4, 12 -; ZVFHMIN64-NEXT: mul s3, s3, s4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s6, v8 -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: li s4, 10 -; ZVFHMIN64-NEXT: mul s3, s3, s4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s4, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 188(sp) -; ZVFHMIN64-NEXT: lh a0, 758(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 188(sp) +; ZVFHMIN64-NEXT: lh a1, 758(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 502(sp) -; ZVFHMIN64-NEXT: csrr s3, vlenb -; ZVFHMIN64-NEXT: slli s3, s3, 4 -; ZVFHMIN64-NEXT: add s3, sp, s3 -; ZVFHMIN64-NEXT: addi s3, s3, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (s3) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s s5, v8 -; ZVFHMIN64-NEXT: vmv.x.s s3, v16 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t4 -; ZVFHMIN64-NEXT: sb a0, 187(sp) -; ZVFHMIN64-NEXT: lh a0, 756(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 187(sp) +; ZVFHMIN64-NEXT: lh a1, 756(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 500(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 -; ZVFHMIN64-NEXT: feq.h t4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t3 -; ZVFHMIN64-NEXT: sb a0, 186(sp) -; ZVFHMIN64-NEXT: lh a0, 754(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 186(sp) +; ZVFHMIN64-NEXT: lh a1, 754(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 498(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 -; ZVFHMIN64-NEXT: feq.h t3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t1 -; ZVFHMIN64-NEXT: sb a0, 185(sp) -; ZVFHMIN64-NEXT: lh a0, 752(sp) +; 
ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 185(sp) +; ZVFHMIN64-NEXT: lh a1, 752(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 496(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a4 -; ZVFHMIN64-NEXT: feq.h t1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t2 -; ZVFHMIN64-NEXT: sb a0, 184(sp) -; ZVFHMIN64-NEXT: lh a0, 750(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 184(sp) +; ZVFHMIN64-NEXT: lh a1, 750(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 494(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s6 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t0 -; ZVFHMIN64-NEXT: sb a0, 183(sp) -; ZVFHMIN64-NEXT: lh a0, 748(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 183(sp) +; ZVFHMIN64-NEXT: lh a1, 748(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 492(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s4 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a7 -; ZVFHMIN64-NEXT: sb a0, 182(sp) -; ZVFHMIN64-NEXT: lh a0, 746(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 182(sp) +; ZVFHMIN64-NEXT: lh a1, 746(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 490(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s5 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a6 -; ZVFHMIN64-NEXT: sb a0, 181(sp) -; ZVFHMIN64-NEXT: lh a0, 744(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 181(sp) +; ZVFHMIN64-NEXT: lh a1, 744(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: lh a1, 488(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, s3 -; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a5 -; ZVFHMIN64-NEXT: addi a1, sp, 800 -; ZVFHMIN64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 15 -; ZVFHMIN64-NEXT: vmv.x.s a5, v8 -; ZVFHMIN64-NEXT: sb a0, 180(sp) -; ZVFHMIN64-NEXT: lh a0, 742(sp) -; ZVFHMIN64-NEXT: lh a7, 486(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: sb a1, 180(sp) +; ZVFHMIN64-NEXT: lh a1, 742(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 +; ZVFHMIN64-NEXT: lh a1, 486(sp) ; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 ; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: sb a0, 179(sp) -; ZVFHMIN64-NEXT: lh a0, 740(sp) -; ZVFHMIN64-NEXT: lh a7, 484(sp) -; ZVFHMIN64-NEXT: sb a2, 140(sp) -; ZVFHMIN64-NEXT: sb t1, 141(sp) -; ZVFHMIN64-NEXT: sb t3, 142(sp) -; ZVFHMIN64-NEXT: sb t4, 143(sp) -; ZVFHMIN64-NEXT: sb a1, 136(sp) -; ZVFHMIN64-NEXT: sb a6, 137(sp) -; ZVFHMIN64-NEXT: sb a4, 138(sp) -; ZVFHMIN64-NEXT: sb a3, 139(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 +; ZVFHMIN64-NEXT: ld a2, 112(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: ld a2, 104(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: 
fmv.h.x fa4, a2 +; ZVFHMIN64-NEXT: sb a1, 179(sp) +; ZVFHMIN64-NEXT: lh a2, 740(sp) +; ZVFHMIN64-NEXT: lh a3, 484(sp) +; ZVFHMIN64-NEXT: sb s2, 140(sp) +; ZVFHMIN64-NEXT: sb a6, 141(sp) +; ZVFHMIN64-NEXT: sb a5, 142(sp) +; ZVFHMIN64-NEXT: sb a0, 143(sp) +; ZVFHMIN64-NEXT: sb ra, 136(sp) +; ZVFHMIN64-NEXT: sb s9, 137(sp) +; ZVFHMIN64-NEXT: sb s10, 138(sp) +; ZVFHMIN64-NEXT: sb s11, 139(sp) +; ZVFHMIN64-NEXT: feq.h a1, fa5, fa4 +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a3 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 178(sp) ; ZVFHMIN64-NEXT: lh a0, 638(sp) -; ZVFHMIN64-NEXT: lh a1, 382(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 14 -; ZVFHMIN64-NEXT: vmv.x.s t3, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 382(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 255(sp) ; ZVFHMIN64-NEXT: lh a0, 636(sp) -; ZVFHMIN64-NEXT: lh a1, 380(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 13 -; ZVFHMIN64-NEXT: vmv.x.s t2, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 380(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 254(sp) ; ZVFHMIN64-NEXT: lh a0, 634(sp) -; ZVFHMIN64-NEXT: lh a1, 378(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 12 -; ZVFHMIN64-NEXT: vmv.x.s t1, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 378(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 253(sp) ; ZVFHMIN64-NEXT: lh a0, 632(sp) -; ZVFHMIN64-NEXT: lh a1, 376(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 11 -; ZVFHMIN64-NEXT: vmv.x.s t0, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 376(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 252(sp) ; ZVFHMIN64-NEXT: lh a0, 630(sp) -; ZVFHMIN64-NEXT: lh a1, 374(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 10 -; ZVFHMIN64-NEXT: vmv.x.s a7, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 374(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 251(sp) ; ZVFHMIN64-NEXT: lh a0, 628(sp) -; ZVFHMIN64-NEXT: lh a1, 372(sp) -; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 9 -; ZVFHMIN64-NEXT: vmv.x.s a6, v8 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 372(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 96(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: sb a0, 250(sp) ; ZVFHMIN64-NEXT: lh a0, 626(sp) -; ZVFHMIN64-NEXT: lh a1, 370(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a5 -; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 370(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 104(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: sb a0, 249(sp) ; ZVFHMIN64-NEXT: lh a0, 624(sp) -; ZVFHMIN64-NEXT: lh a1, 368(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t3 -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 368(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; 
ZVFHMIN64-NEXT: ld a1, 120(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: sb a0, 248(sp) ; ZVFHMIN64-NEXT: lh a0, 622(sp) -; ZVFHMIN64-NEXT: lh a1, 366(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t2 -; ZVFHMIN64-NEXT: feq.h a4, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 366(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 88(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: sb a0, 247(sp) ; ZVFHMIN64-NEXT: lh a0, 620(sp) -; ZVFHMIN64-NEXT: lh a1, 364(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t1 -; ZVFHMIN64-NEXT: feq.h a5, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 364(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: ld a1, 112(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: fmv.h.x fa5, a1 ; ZVFHMIN64-NEXT: sb a0, 246(sp) ; ZVFHMIN64-NEXT: lh a0, 618(sp) -; ZVFHMIN64-NEXT: lh a1, 362(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, t0 -; ZVFHMIN64-NEXT: feq.h t0, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 362(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, s2 ; ZVFHMIN64-NEXT: sb a0, 245(sp) ; ZVFHMIN64-NEXT: lh a0, 616(sp) -; ZVFHMIN64-NEXT: lh a1, 360(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a7 -; ZVFHMIN64-NEXT: feq.h a7, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 360(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t6 ; ZVFHMIN64-NEXT: sb a0, 244(sp) ; ZVFHMIN64-NEXT: lh a0, 614(sp) -; ZVFHMIN64-NEXT: lh a1, 358(sp) -; ZVFHMIN64-NEXT: fmv.h.x fa4, a6 -; ZVFHMIN64-NEXT: feq.h a6, fa5, fa4 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: lh a0, 358(sp) +; ZVFHMIN64-NEXT: fmv.h.x fa4, a0 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 -; ZVFHMIN64-NEXT: fmv.h.x fa5, t5 +; ZVFHMIN64-NEXT: ld a2, 120(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fmv.h.x fa5, a2 +; ZVFHMIN64-NEXT: vsetivli zero, 1, e16, m2, ta, ma ; ZVFHMIN64-NEXT: vslidedown.vi v8, v24, 8 -; ZVFHMIN64-NEXT: vmv.x.s a1, v8 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: vmv.x.s a2, v8 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: sb a0, 243(sp) ; ZVFHMIN64-NEXT: lh a0, 612(sp) -; ZVFHMIN64-NEXT: lh a1, 356(sp) -; ZVFHMIN64-NEXT: sb a5, 204(sp) -; ZVFHMIN64-NEXT: sb a4, 205(sp) -; ZVFHMIN64-NEXT: sb a2, 206(sp) -; ZVFHMIN64-NEXT: sb a3, 207(sp) -; ZVFHMIN64-NEXT: feq.h a2, fa5, fa4 -; ZVFHMIN64-NEXT: sb a2, 200(sp) -; ZVFHMIN64-NEXT: sb a6, 201(sp) -; ZVFHMIN64-NEXT: sb a7, 202(sp) -; ZVFHMIN64-NEXT: sb t0, 203(sp) -; ZVFHMIN64-NEXT: li a2, 128 +; ZVFHMIN64-NEXT: lh a2, 356(sp) +; ZVFHMIN64-NEXT: sb s6, 204(sp) +; ZVFHMIN64-NEXT: sb s8, 205(sp) +; ZVFHMIN64-NEXT: sb s7, 206(sp) +; ZVFHMIN64-NEXT: sb s3, 207(sp) +; ZVFHMIN64-NEXT: feq.h a3, fa5, fa4 +; ZVFHMIN64-NEXT: sb a3, 200(sp) +; ZVFHMIN64-NEXT: sb a1, 201(sp) +; ZVFHMIN64-NEXT: sb a4, 202(sp) +; ZVFHMIN64-NEXT: sb s4, 203(sp) +; ZVFHMIN64-NEXT: li a1, 128 ; ZVFHMIN64-NEXT: fmv.h.x fa5, a0 -; ZVFHMIN64-NEXT: fmv.h.x fa4, a1 +; ZVFHMIN64-NEXT: fmv.h.x fa4, a2 ; ZVFHMIN64-NEXT: feq.h a0, fa5, fa4 ; ZVFHMIN64-NEXT: sb a0, 242(sp) ; ZVFHMIN64-NEXT: addi a0, sp, 128 -; ZVFHMIN64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; ZVFHMIN64-NEXT: vsetvli zero, a1, 
e8, m8, ta, ma ; ZVFHMIN64-NEXT: vle8.v v8, (a0) ; ZVFHMIN64-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN64-NEXT: addi sp, s0, -896 -; ZVFHMIN64-NEXT: .cfi_def_cfa sp, 896 -; ZVFHMIN64-NEXT: ld ra, 888(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s0, 880(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s2, 872(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s3, 864(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s4, 856(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s5, 848(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s6, 840(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s7, 832(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s8, 824(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s9, 816(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s10, 808(sp) # 8-byte Folded Reload -; ZVFHMIN64-NEXT: ld s11, 800(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, s0, -1024 +; ZVFHMIN64-NEXT: .cfi_def_cfa sp, 1024 +; ZVFHMIN64-NEXT: ld ra, 1016(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s0, 1008(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s2, 1000(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s3, 992(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s4, 984(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s5, 976(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s6, 968(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s7, 960(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s8, 952(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s9, 944(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s10, 936(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: ld s11, 928(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs0, 920(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs1, 912(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs2, 904(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs3, 896(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs4, 888(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs5, 880(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs6, 872(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs7, 864(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs8, 856(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs9, 848(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs10, 840(sp) # 8-byte Folded Reload +; ZVFHMIN64-NEXT: fld fs11, 832(sp) # 8-byte Folded Reload ; ZVFHMIN64-NEXT: .cfi_restore ra ; ZVFHMIN64-NEXT: .cfi_restore s0 ; ZVFHMIN64-NEXT: .cfi_restore s2 @@ -3376,7 +3570,19 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 ; ZVFHMIN64-NEXT: .cfi_restore s9 ; ZVFHMIN64-NEXT: .cfi_restore s10 ; ZVFHMIN64-NEXT: .cfi_restore s11 -; ZVFHMIN64-NEXT: addi sp, sp, 896 +; ZVFHMIN64-NEXT: .cfi_restore fs0 +; ZVFHMIN64-NEXT: .cfi_restore fs1 +; ZVFHMIN64-NEXT: .cfi_restore fs2 +; ZVFHMIN64-NEXT: .cfi_restore fs3 +; ZVFHMIN64-NEXT: .cfi_restore fs4 +; ZVFHMIN64-NEXT: .cfi_restore fs5 +; ZVFHMIN64-NEXT: .cfi_restore fs6 +; ZVFHMIN64-NEXT: .cfi_restore fs7 +; ZVFHMIN64-NEXT: .cfi_restore fs8 +; ZVFHMIN64-NEXT: .cfi_restore fs9 +; ZVFHMIN64-NEXT: .cfi_restore fs10 +; ZVFHMIN64-NEXT: .cfi_restore fs11 +; ZVFHMIN64-NEXT: addi sp, sp, 1024 ; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN64-NEXT: ret %v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl) @@ -3953,20 +4159,20 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded 
Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB87_2 +; CHECK-NEXT: bltu a2, a3, .LBB87_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB87_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v7, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 @@ -3977,13 +4183,13 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v7, v8, 2 +; CHECK-NEXT: vslideup.vi v7, v16, 2 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll index 69d6ffa9f300c..81b8b2d5a2c88 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -592,55 +592,30 @@ declare <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8>, <256 x i8>, metadata, <256 x define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_v256i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: li a1, 128 ; CHECK-NEXT: addi a4, a0, 128 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: addi a2, a3, -128 -; CHECK-NEXT: vle8.v v8, (a4) +; CHECK-NEXT: vle8.v v24, (a4) ; CHECK-NEXT: sltu a4, a3, a2 -; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a4, a4, -1 ; CHECK-NEXT: and a2, a4, a2 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmseq.vv v6, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: bltu a3, a1, .LBB51_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB51_2: ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: 
csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmv1r.v v8, v6 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> %va, <256 x i8> %vb, metadata !"eq", <256 x i1> %m, i32 %evl) ret <256 x i1> %v @@ -652,12 +627,12 @@ define <256 x i1> @icmp_eq_vx_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 z ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a3, 128 +; CHECK-NEXT: addi a4, a2, -128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a1) -; CHECK-NEXT: addi a1, a2, -128 -; CHECK-NEXT: sltu a4, a2, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a1, a2, a4 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB52_2 @@ -682,12 +657,12 @@ define <256 x i1> @icmp_eq_vx_swap_v256i8(<256 x i8> %va, i8 %b, <256 x i1> %m, ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a3, 128 +; CHECK-NEXT: addi a4, a2, -128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a1) -; CHECK-NEXT: addi a1, a2, -128 -; CHECK-NEXT: sltu a4, a2, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a1, a2, a4 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB53_2 @@ -1263,19 +1238,19 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 4 ; CHECK-NEXT: bltu a2, a3, .LBB99_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB99_2: -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vv v7, v8, v16, v0.t ; CHECK-NEXT: addi a0, a2, -32 ; CHECK-NEXT: sltu a1, a2, a0 @@ -1308,9 +1283,9 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bltu a1, a3, .LBB100_2 ; CHECK-NEXT: # %bb.1: @@ -1338,9 +1313,9 @@ define <64 x i1> 
@icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bltu a1, a3, .LBB101_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll index d1980ee3b0a6f..26477edb33adc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sext-vp.ll @@ -151,9 +151,9 @@ declare <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32>, <32 x i1>, i32) define <32 x i64> @vsext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vsext_v32i64_v32i32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB12_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll index abbbfe8f252fb..d64b39488023b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll @@ -31,8 +31,8 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) { ; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: vslide1down.vx v8, v8, a1 ; RV32-NEXT: vslide1down.vx v9, v9, a2 -; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vslide1down.vx v9, v9, a0 +; RV32-NEXT: vmv.v.i v0, 15 ; RV32-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV32-NEXT: vand.vi v8, v8, 1 ; RV32-NEXT: vmsne.vi v0, v8, 0 @@ -65,8 +65,8 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) { ; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: vslide1down.vx v8, v8, a1 ; RV64-NEXT: vslide1down.vx v9, v9, a2 -; RV64-NEXT: vmv.v.i v0, 15 ; RV64-NEXT: vslide1down.vx v9, v9, a0 +; RV64-NEXT: vmv.v.i v0, 15 ; RV64-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64-NEXT: vand.vi v8, v8, 1 ; RV64-NEXT: vmsne.vi v0, v8, 0 @@ -80,13 +80,13 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vsrl.vi v10, v10, 1 ; CHECK-NEXT: vrsub.vi v11, v10, 3 ; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 4 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vslidedown.vi v10, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -156,15 +156,15 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) { ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 4 -; RV32-NEXT: lw a0, 36(sp) -; RV32-NEXT: vmv.x.s a1, v16 +; RV32-NEXT: vmv.x.s a0, v16 +; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 -; RV32-NEXT: lw a1, 120(sp) -; RV32-NEXT: vslide1down.vx v9, v9, a0 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: vslide1down.vx v8, v9, a0 +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: lw a0, 36(sp) +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: lw a0, 120(sp) +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: addi sp, s0, -256 ; 
RV32-NEXT: .cfi_def_cfa sp, 256 ; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload @@ -194,15 +194,15 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) { ; RV64-NEXT: vse32.v v8, (a1) ; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 4 -; RV64-NEXT: lw a0, 36(sp) -; RV64-NEXT: vmv.x.s a1, v16 +; RV64-NEXT: vmv.x.s a0, v16 +; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a1 -; RV64-NEXT: lw a1, 120(sp) -; RV64-NEXT: vslide1down.vx v9, v9, a0 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: vslide1down.vx v8, v9, a0 +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: lw a0, 36(sp) +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: vslide1down.vx v8, v8, a1 +; RV64-NEXT: lw a0, 120(sp) +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: addi sp, s0, -256 ; RV64-NEXT: .cfi_def_cfa sp, 256 ; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload @@ -219,13 +219,13 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) { define <16 x i1> @v16i1_v8i1(<8 x i1>) { ; CHECK-LABEL: v16i1_v8i1: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 -; CHECK-NEXT: vrgather.vv v10, v9, v8 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vrgather.vv v10, v8, v9 ; CHECK-NEXT: vmsne.vi v0, v10, 0 ; CHECK-NEXT: ret %2 = shufflevector <8 x i1> %0, <8 x i1> poison, <16 x i32> @@ -235,12 +235,17 @@ define <16 x i1> @v16i1_v8i1(<8 x i1>) { define <8 x i32> @v8i32_v4i32(<4 x i32>) { ; CHECK-LABEL: v8i32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: lui a1, %hi(.LCPI5_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI5_0) +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <8 x i32> ret <8 x i32> %2 @@ -249,30 +254,40 @@ define <8 x i32> @v8i32_v4i32(<4 x i32>) { define <16 x i32> @v16i32_v4i32(<4 x i32>) { ; CHECK-LABEL: v16i32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 2 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: lui a0, 2 +; CHECK-NEXT: vmv.v.i v10, 3 ; CHECK-NEXT: addi a1, a0, 265 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 ; CHECK-NEXT: lui a1, 4 ; CHECK-NEXT: addi a1, a1, 548 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: addi a0, a0, -1856 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v9, v9, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a0 -; 
CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 +; CHECK-NEXT: vmerge.vim v10, v10, 2, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v16, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vslidedown.vx v14, v10, a1 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v9, v12, v14 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v14, v14, a1 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v12, v10 +; CHECK-NEXT: vrgatherei16.vv v10, v12, v14 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v14, v14, a1 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v11, v12, v14 ; CHECK-NEXT: ret %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <16 x i32> ret <16 x i32> %2 @@ -290,22 +305,48 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) { ; CHECK-NEXT: addi a1, a1, 548 ; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: lui a1, 100550 +; CHECK-NEXT: addi a1, a1, 64 +; CHECK-NEXT: vmv.s.x v10, a1 +; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 3 -; CHECK-NEXT: addi a0, a1, 64 -; CHECK-NEXT: vmerge.vim v18, v10, 2, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v16, a0 +; CHECK-NEXT: vmv.v.i v12, 3 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vmerge.vim v12, v12, 2, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v18, v18, 0, v0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmerge.vim v16, v18, 1, v0 +; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v12, 1, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vsext.vf2 v24, v16 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsext.vf2 v12, v10 +; CHECK-NEXT: vslidedown.vx v20, v12, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v17, v8, v20 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v20, v20, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v12, v20, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v18, v8, v20 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v24, v12, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v19, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v12, v24, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v20, v8, v24 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v24, v12, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v21, v8, v12 +; CHECK-NEXT: vrgatherei16.vv v22, v8, v24 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v12, v24, a1 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: 
vrgatherei16.vv v23, v8, v12 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %2 = shufflevector <4 x i32> %0, <4 x i32> poison, <32 x i32> ret <32 x i32> %2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index f2353e7d028bd..5c2d61138df13 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -185,9 +185,9 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x ; VLA-NEXT: vmv2r.v v20, v14 ; VLA-NEXT: vmv2r.v v16, v12 ; VLA-NEXT: vmv2r.v v12, v10 -; VLA-NEXT: li a0, 32 ; VLA-NEXT: vslideup.vi v16, v20, 8 ; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; VLA-NEXT: vslideup.vi v8, v16, 16 ; VLA-NEXT: ret @@ -212,7 +212,6 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x ; VLA-NEXT: vmv1r.v v22, v11 ; VLA-NEXT: vmv1r.v v12, v10 ; VLA-NEXT: vmv1r.v v10, v9 -; VLA-NEXT: li a0, 32 ; VLA-NEXT: vslideup.vi v20, v18, 4 ; VLA-NEXT: vslideup.vi v16, v14, 4 ; VLA-NEXT: vslideup.vi v12, v22, 4 @@ -220,6 +219,7 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x ; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; VLA-NEXT: vslideup.vi v16, v20, 8 ; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: li a0, 32 ; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; VLA-NEXT: vslideup.vi v8, v16, 16 ; VLA-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index 10dadbc022e02..140d1450e1e5c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -11,16 +11,15 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: li a0, 73 ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI0_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI0_0) ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vrgather.vv v10, v8, v9 ; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret @@ -36,16 +35,15 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: li a0, 146 ; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vrgather.vv v10, v8, v9 ; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret diff 
--git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index c0c17d4e0623e..0b7a50912b447 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -186,9 +186,9 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) { ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v11, (a0) -; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vsrl.vi v10, v10, 1 ; CHECK-NEXT: vadd.vi v10, v10, 1 @@ -210,14 +210,14 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) { ; CHECK-LABEL: shuffle2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v8 -; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v12, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v13 -; CHECK-NEXT: vadd.vv v13, v13, v13 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vrsub.vi v9, v9, 4 ; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vrsub.vi v13, v13, 4 -; CHECK-NEXT: vrgather.vv v9, v12, v13, v0.t +; CHECK-NEXT: vrgather.vv v13, v8, v9, v0.t +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> , float %b, i32 5 @@ -255,11 +255,10 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca ; RV64-NEXT: addi s0, sp, 256 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v0, 1 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmv.v.i v0, 1 ; RV64-NEXT: vrgather.vi v18, v15, 1, v0.t ; RV64-NEXT: mv s2, sp ; RV64-NEXT: vs8r.v v16, (s2) @@ -291,9 +290,9 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmv1r.v v13, v10 ; CHECK-NEXT: vslideup.vi v13, v11, 1 +; CHECK-NEXT: vrgather.vi v12, v9, 0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: vmv.v.i v0, 1 -; CHECK-NEXT: vrgather.vi v12, v9, 0 ; CHECK-NEXT: vmv1r.v v9, v11 ; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma @@ -326,8 +325,8 @@ define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_ran ; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vi v12, v10, 0 +; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: vrgather.vi v12, v11, 0, v0.t ; CHECK-NEXT: vrgather.vi v14, v8, 2 ; CHECK-NEXT: vrgather.vi v15, v10, 3 @@ -348,16 +347,18 @@ define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) { ; RV32-NEXT: vwsubu.vx v12, v10, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vand.vx v12, v12, a1 ; RV32-NEXT: vand.vx v10, v10, a1 ; RV32-NEXT: vsrl.vv v12, v8, v12 ; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi 
a0, a0, -241 +; RV32-NEXT: vsetivli zero, 16, e64, m2, ta, ma +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: vrgather.vi v10, v8, 2 ; RV32-NEXT: vor.vv v8, v8, v10 @@ -373,12 +374,12 @@ define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) { ; RV64-NEXT: vsetivli zero, 16, e64, m2, ta, ma ; RV64-NEXT: vsrl.vx v10, v8, a0 ; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addi a0, a0, -241 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: vrgather.vi v10, v8, 2 ; RV64-NEXT: vor.vv v8, v8, v10 @@ -437,11 +438,9 @@ define void @shuffle_3_input_vectors() vscale_range(4,4) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv.v.i v8, 1 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv.v.i v16, 0 ; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 6 ; CHECK-NEXT: vslidedown.vi v20, v8, 1, v0.t ; CHECK-NEXT: vslideup.vi v20, v9, 3 ; CHECK-NEXT: vslidedown.vi v21, v9, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index c222626a166fe..eb0ee5773962b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -1161,8 +1161,8 @@ define <32 x half> @reverse_v32f16_2(<16 x half> %a) { ; CHECK-NEXT: vrgather.vv v10, v9, v12 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: vrgather.vv v8, v9, v12 ; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: vrgather.vv v8, v9, v12 ; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 86d8a275a9055..c9fe39685fbc6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -969,11 +969,44 @@ define <8 x i64> @shuffle_v8i64_as_i128(<8 x i64> %v) { ret <8 x i64> %shuffle } -define <8 x i64> @shuffle_v8i64_as_i256(<8 x i64> %v) { -; CHECK-LABEL: shuffle_v8i64_as_i256: +; Test case where first span has undefs +define <8 x i64> @shuffle_v8i64_as_i128_2(<8 x i64> %v) { +; CHECK-LABEL: shuffle_v8i64_as_i128_2: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI30_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v13, v9, v16 +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vrgatherei16.vv v14, v10, v16 +; CHECK-NEXT: vrgatherei16.vv v15, v11, v16 +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret +; +; ZVKB-V-LABEL: shuffle_v8i64_as_i128_2: +; ZVKB-V: # %bb.0: +; ZVKB-V-NEXT: lui a0, %hi(.LCPI30_0) +; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI30_0) +; ZVKB-V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVKB-V-NEXT: vle16.v v16, (a0) +; ZVKB-V-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVKB-V-NEXT: vrgatherei16.vv v13, v9, v16 +; 
ZVKB-V-NEXT: vrgatherei16.vv v12, v8, v16 +; ZVKB-V-NEXT: vrgatherei16.vv v14, v10, v16 +; ZVKB-V-NEXT: vrgatherei16.vv v15, v11, v16 +; ZVKB-V-NEXT: vmv4r.v v8, v12 +; ZVKB-V-NEXT: ret + %shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_as_i256(<8 x i64> %v) { +; CHECK-LABEL: shuffle_v8i64_as_i256: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI31_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0) ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 @@ -982,8 +1015,8 @@ define <8 x i64> @shuffle_v8i64_as_i256(<8 x i64> %v) { ; ; ZVKB-V-LABEL: shuffle_v8i64_as_i256: ; ZVKB-V: # %bb.0: -; ZVKB-V-NEXT: lui a0, %hi(.LCPI30_0) -; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI30_0) +; ZVKB-V-NEXT: lui a0, %hi(.LCPI31_0) +; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI31_0) ; ZVKB-V-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; ZVKB-V-NEXT: vle16.v v16, (a0) ; ZVKB-V-NEXT: vrgatherei16.vv v12, v8, v16 @@ -996,8 +1029,8 @@ define <8 x i64> @shuffle_v8i64_as_i256(<8 x i64> %v) { define <8 x i64> @shuffle_v8i64_as_i256_zvl256b(<8 x i64> %v) vscale_range(4,0) { ; CHECK-LABEL: shuffle_v8i64_as_i256_zvl256b: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI31_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0) +; CHECK-NEXT: lui a0, %hi(.LCPI32_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0) ; CHECK-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma @@ -1008,8 +1041,8 @@ define <8 x i64> @shuffle_v8i64_as_i256_zvl256b(<8 x i64> %v) vscale_range(4,0) ; ; ZVKB-V-LABEL: shuffle_v8i64_as_i256_zvl256b: ; ZVKB-V: # %bb.0: -; ZVKB-V-NEXT: lui a0, %hi(.LCPI31_0) -; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI31_0) +; ZVKB-V-NEXT: lui a0, %hi(.LCPI32_0) +; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI32_0) ; ZVKB-V-NEXT: vsetivli zero, 8, e16, mf2, ta, ma ; ZVKB-V-NEXT: vle16.v v12, (a0) ; ZVKB-V-NEXT: vsetvli a0, zero, e64, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index 3c28e978842b9..72a62627755dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -179,9 +179,9 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) { ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vmv.v.i v0, 1 ; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; ZVE32F-NEXT: vmv.v.i v0, 1 ; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t ; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret @@ -233,9 +233,9 @@ define void @vnsrl_32_float(ptr %in, ptr %out) { ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vmv.v.i v0, 1 ; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu ; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; ZVE32F-NEXT: vmv.v.i v0, 1 ; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t ; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret @@ -276,9 +276,9 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) { ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vmv.v.i v0, 1 ; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; V-NEXT: vslidedown.vi v9, v8, 2 +; V-NEXT: vmv.v.i v0, 1 ; V-NEXT: vrgather.vi v9, v8, 1, v0.t ; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret @@ 
-327,9 +327,9 @@ define void @vnsrl_64_double(ptr %in, ptr %out) { ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vmv.v.i v0, 1 ; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; V-NEXT: vslidedown.vi v9, v8, 2 +; V-NEXT: vmv.v.i v0, 1 ; V-NEXT: vrgather.vi v9, v8, 1, v0.t ; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll index a2d41de5d1853..ba3b994de46f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll @@ -390,9 +390,9 @@ declare <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64>, <32 x i1>, i32) define <32 x double> @vsitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vsitofp_v32f64_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll index 391117c72ece7..3a3d417868dfe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store-merge-crash.ll @@ -14,8 +14,8 @@ define void @baz() nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(foo) ; CHECK-NEXT: addi a1, a0, %lo(foo) -; CHECK-NEXT: lw a1, 4(a1) ; CHECK-NEXT: lw a0, %lo(foo)(a0) +; CHECK-NEXT: lw a1, 4(a1) ; CHECK-NEXT: lui a2, %hi(bar) ; CHECK-NEXT: sw a1, %lo(bar)(a2) ; CHECK-NEXT: addi a1, a2, %lo(bar) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 29d9a8a9b060c..0510cac7ffd0e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -638,10 +638,10 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; V-NEXT: .LBB12_1: # %bb2 ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: vlse64.v v8, (a1), a3 -; V-NEXT: addi a4, a1, 80 -; V-NEXT: vlse64.v v9, (a4), a3 ; V-NEXT: addi a4, a0, 16 +; V-NEXT: addi a5, a1, 80 +; V-NEXT: vlse64.v v8, (a1), a3 +; V-NEXT: vlse64.v v9, (a5), a3 ; V-NEXT: vse64.v v8, (a0) ; V-NEXT: addi a0, a0, 32 ; V-NEXT: vse64.v v9, (a4) @@ -662,6 +662,7 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; ZVE32F-NEXT: mul a6, a3, a5 ; ZVE32F-NEXT: mul a7, a2, a5 ; ZVE32F-NEXT: addi a2, a2, 4 +; ZVE32F-NEXT: addi a3, a3, 4 ; ZVE32F-NEXT: add a6, a1, a6 ; ZVE32F-NEXT: add a7, a1, a7 ; ZVE32F-NEXT: ld t0, 0(a7) @@ -673,7 +674,6 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; ZVE32F-NEXT: sd a7, 16(a0) ; ZVE32F-NEXT: sd a6, 24(a0) ; ZVE32F-NEXT: addi a0, a0, 32 -; ZVE32F-NEXT: addi a3, a3, 4 ; ZVE32F-NEXT: bne a0, a4, .LBB12_1 ; ZVE32F-NEXT: # %bb.2: # %bb18 ; ZVE32F-NEXT: ret @@ -686,10 +686,10 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; OPTZVE32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; OPTZVE32F-NEXT: .LBB12_1: # %bb2 ; OPTZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 -; OPTZVE32F-NEXT: vlse64.v v8, (a1), a3 -; 
OPTZVE32F-NEXT: addi a4, a1, 80 -; OPTZVE32F-NEXT: vlse64.v v9, (a4), a3 ; OPTZVE32F-NEXT: addi a4, a0, 16 +; OPTZVE32F-NEXT: addi a5, a1, 80 +; OPTZVE32F-NEXT: vlse64.v v8, (a1), a3 +; OPTZVE32F-NEXT: vlse64.v v9, (a5), a3 ; OPTZVE32F-NEXT: vse64.v v8, (a0) ; OPTZVE32F-NEXT: addi a0, a0, 32 ; OPTZVE32F-NEXT: vse64.v v9, (a4) @@ -710,6 +710,7 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; OPTV-NEXT: mul a6, a3, a5 ; OPTV-NEXT: mul a7, a2, a5 ; OPTV-NEXT: addi a2, a2, 4 +; OPTV-NEXT: addi a3, a3, 4 ; OPTV-NEXT: add a6, a1, a6 ; OPTV-NEXT: add a7, a1, a7 ; OPTV-NEXT: ld t0, 0(a7) @@ -721,7 +722,6 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; OPTV-NEXT: sd a7, 16(a0) ; OPTV-NEXT: sd a6, 24(a0) ; OPTV-NEXT: addi a0, a0, 32 -; OPTV-NEXT: addi a3, a3, 4 ; OPTV-NEXT: bne a0, a4, .LBB12_1 ; OPTV-NEXT: # %bb.2: # %bb18 ; OPTV-NEXT: ret @@ -791,14 +791,14 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; ZVE32F-NEXT: mul t2, a3, a5 ; ZVE32F-NEXT: mul t3, a2, a5 ; ZVE32F-NEXT: addi a2, a2, 4 -; ZVE32F-NEXT: addi a1, a1, 32 +; ZVE32F-NEXT: addi a3, a3, 4 ; ZVE32F-NEXT: add t2, a0, t2 ; ZVE32F-NEXT: add t3, a0, t3 ; ZVE32F-NEXT: sd a6, 0(t3) ; ZVE32F-NEXT: sd a7, 0(t2) ; ZVE32F-NEXT: sd t0, 80(t3) ; ZVE32F-NEXT: sd t1, 80(t2) -; ZVE32F-NEXT: addi a3, a3, 4 +; ZVE32F-NEXT: addi a1, a1, 32 ; ZVE32F-NEXT: bne a1, a4, .LBB13_1 ; ZVE32F-NEXT: # %bb.2: # %bb18 ; ZVE32F-NEXT: ret @@ -839,14 +839,14 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; OPTV-NEXT: mul t2, a3, a5 ; OPTV-NEXT: mul t3, a2, a5 ; OPTV-NEXT: addi a2, a2, 4 -; OPTV-NEXT: addi a1, a1, 32 +; OPTV-NEXT: addi a3, a3, 4 ; OPTV-NEXT: add t2, a0, t2 ; OPTV-NEXT: add t3, a0, t3 ; OPTV-NEXT: sd a6, 0(t3) ; OPTV-NEXT: sd a7, 0(t2) ; OPTV-NEXT: sd t0, 80(t3) ; OPTV-NEXT: sd t1, 80(t2) -; OPTV-NEXT: addi a3, a3, 4 +; OPTV-NEXT: addi a1, a1, 32 ; OPTV-NEXT: bne a1, a4, .LBB13_1 ; OPTV-NEXT: # %bb.2: # %bb18 ; OPTV-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index 4b7f82f94f5e4..fe86344ec73fb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -609,11 +609,11 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: .LBB47_2: ; CHECK-RV32-NEXT: mul a6, a3, a2 ; CHECK-RV32-NEXT: addi a5, a4, -32 +; CHECK-RV32-NEXT: add a6, a1, a6 ; CHECK-RV32-NEXT: sltu a7, a4, a5 ; CHECK-RV32-NEXT: addi a7, a7, -1 ; CHECK-RV32-NEXT: and a7, a7, a5 ; CHECK-RV32-NEXT: li a5, 16 -; CHECK-RV32-NEXT: add a6, a1, a6 ; CHECK-RV32-NEXT: bltu a7, a5, .LBB47_4 ; CHECK-RV32-NEXT: # %bb.3: ; CHECK-RV32-NEXT: li a7, 16 @@ -636,16 +636,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: add a5, a1, a5 ; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v24, (a5), a2, v0.t +; CHECK-RV32-NEXT: addi a3, a0, 128 ; CHECK-RV32-NEXT: vmv1r.v v0, v8 ; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV32-NEXT: vlse64.v v8, (a1), a2, v0.t -; CHECK-RV32-NEXT: addi a1, a0, 128 -; CHECK-RV32-NEXT: addi a2, a0, 256 +; CHECK-RV32-NEXT: addi a1, a0, 256 ; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV32-NEXT: vse64.v v8, (a0) -; CHECK-RV32-NEXT: vse64.v v24, (a1) +; CHECK-RV32-NEXT: vse64.v v24, (a3) ; 
CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vse64.v v16, (a2) +; CHECK-RV32-NEXT: vse64.v v16, (a1) ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: strided_load_v33f64: @@ -660,11 +660,11 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV64-NEXT: .LBB47_2: ; CHECK-RV64-NEXT: mul a6, a4, a2 ; CHECK-RV64-NEXT: addi a5, a3, -32 +; CHECK-RV64-NEXT: add a6, a1, a6 ; CHECK-RV64-NEXT: sltu a7, a3, a5 ; CHECK-RV64-NEXT: addi a7, a7, -1 ; CHECK-RV64-NEXT: and a7, a7, a5 ; CHECK-RV64-NEXT: li a5, 16 -; CHECK-RV64-NEXT: add a6, a1, a6 ; CHECK-RV64-NEXT: bltu a7, a5, .LBB47_4 ; CHECK-RV64-NEXT: # %bb.3: ; CHECK-RV64-NEXT: li a7, 16 @@ -687,16 +687,16 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV64-NEXT: add a5, a1, a5 ; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v24, (a5), a2, v0.t +; CHECK-RV64-NEXT: addi a4, a0, 128 ; CHECK-RV64-NEXT: vmv1r.v v0, v8 ; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-RV64-NEXT: vlse64.v v8, (a1), a2, v0.t -; CHECK-RV64-NEXT: addi a1, a0, 128 -; CHECK-RV64-NEXT: addi a2, a0, 256 +; CHECK-RV64-NEXT: addi a1, a0, 256 ; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-RV64-NEXT: vse64.v v8, (a0) -; CHECK-RV64-NEXT: vse64.v v24, (a1) +; CHECK-RV64-NEXT: vse64.v v24, (a4) ; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vse64.v v16, (a2) +; CHECK-RV64-NEXT: vse64.v v16, (a1) ; CHECK-RV64-NEXT: ret %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl) ret <33 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll index 7ca329835b7ac..733c850d64011 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll @@ -472,9 +472,9 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid ; CHECK-NEXT: addi a3, a2, -16 ; CHECK-NEXT: sltu a2, a2, a3 ; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: and a2, a2, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v16, (a0), a1, v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll index a91dee1cb245f..dd5630e165f19 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll @@ -55,8 +55,8 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vslidedown.vi v12, v0, 8 +; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: mv a2, a0 ; CHECK-NEXT: bltu a0, a1, .LBB4_2 ; CHECK-NEXT: # %bb.1: @@ -245,64 +245,64 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: vslidedown.vi v6, v0, 8 -; CHECK-NEXT: addi a2, a1, 512 -; CHECK-NEXT: addi a3, a1, 640 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v26, v0, 4 +; CHECK-NEXT: addi a3, a1, 128 +; CHECK-NEXT: addi a2, a1, 640 ; CHECK-NEXT: addi 
a4, a7, -64 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a2) +; CHECK-NEXT: sltu a2, a7, a4 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a4, a2, a4 +; CHECK-NEXT: addi a2, a4, -32 +; CHECK-NEXT: sltu a5, a4, a2 +; CHECK-NEXT: addi a5, a5, -1 +; CHECK-NEXT: and a5, a5, a2 +; CHECK-NEXT: addi a2, a5, -16 +; CHECK-NEXT: sltu a6, a5, a2 +; CHECK-NEXT: addi a6, a6, -1 +; CHECK-NEXT: and a2, a6, a2 +; CHECK-NEXT: addi a6, a1, 512 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v27, v6, 4 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a3) -; CHECK-NEXT: sltu a3, a7, a4 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v27, 2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a4, a3, a4 -; CHECK-NEXT: addi a3, a4, -32 -; CHECK-NEXT: sltu a5, a4, a3 -; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: and a3, a5, a3 -; CHECK-NEXT: addi a5, a3, -16 -; CHECK-NEXT: sltu a6, a3, a5 -; CHECK-NEXT: addi a6, a6, -1 -; CHECK-NEXT: and a5, a6, a5 -; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 4 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a2) -; CHECK-NEXT: addi a5, a1, 128 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v26, v7, 4 -; CHECK-NEXT: bltu a3, a2, .LBB16_2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a6) +; CHECK-NEXT: bltu a5, a2, .LBB16_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: li a5, 16 ; CHECK-NEXT: .LBB16_2: ; CHECK-NEXT: vmv1r.v v0, v27 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a5) -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: li a6, 56 -; CHECK-NEXT: mul a5, a5, a6 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v27, v26, 2 -; CHECK-NEXT: li a5, 64 -; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: li a6, 56 +; CHECK-NEXT: mul a3, a3, a6 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v16, v8, 0, v0.t ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: slli a3, a3, 6 ; CHECK-NEXT: add a3, sp, a3 ; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: li a3, 64 ; CHECK-NEXT: mv a6, a7 -; CHECK-NEXT: bltu a7, a5, .LBB16_4 +; CHECK-NEXT: bltu a7, a3, .LBB16_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a6, 64 ; CHECK-NEXT: .LBB16_4: @@ -343,13 +343,13 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: li a6, 16 ; CHECK-NEXT: .LBB16_6: ; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: addi a1, a1, 256 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: 
vslidedown.vi v26, v6, 2 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a5) ; CHECK-NEXT: addi a5, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a1, 256 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v26, v6, 2 ; CHECK-NEXT: csrr a5, vlenb ; CHECK-NEXT: li t0, 48 ; CHECK-NEXT: mul a5, a5, t0 @@ -369,13 +369,13 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: li a5, 32 ; CHECK-NEXT: .LBB16_8: +; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) ; CHECK-NEXT: addi a1, a5, -16 ; CHECK-NEXT: sltu a5, a5, a1 ; CHECK-NEXT: addi a5, a5, -1 ; CHECK-NEXT: and a1, a5, a1 -; CHECK-NEXT: vmv1r.v v0, v26 ; CHECK-NEXT: addi a5, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma @@ -543,8 +543,8 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v12, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB17_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll index a0d5d2ccc848d..32aeb6300d17d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll @@ -390,9 +390,9 @@ declare <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64>, <32 x i1>, i32) define <32 x double> @vuitofp_v32f64_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vuitofp_v32f64_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB25_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll index 6d9f69f436fc4..8e7f6666fb4ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -84,10 +84,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> % ; RV32-SLOW-NEXT: # %bb.1: # %cond.load ; RV32-SLOW-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-SLOW-NEXT: vmv.x.s a1, v8 -; RV32-SLOW-NEXT: lbu a2, 1(a1) -; RV32-SLOW-NEXT: lbu a1, 0(a1) -; RV32-SLOW-NEXT: slli a2, a2, 8 -; RV32-SLOW-NEXT: or a1, a2, a1 +; RV32-SLOW-NEXT: lbu a2, 0(a1) +; RV32-SLOW-NEXT: lbu a1, 1(a1) +; RV32-SLOW-NEXT: slli a1, a1, 8 +; RV32-SLOW-NEXT: or a1, a1, a2 ; RV32-SLOW-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV32-SLOW-NEXT: vmv.s.x v9, a1 ; RV32-SLOW-NEXT: .LBB4_2: # %else @@ -97,10 +97,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> % ; RV32-SLOW-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-SLOW-NEXT: vslidedown.vi v8, v8, 1 ; RV32-SLOW-NEXT: vmv.x.s a0, v8 -; RV32-SLOW-NEXT: lbu a1, 1(a0) -; RV32-SLOW-NEXT: lbu a0, 0(a0) -; RV32-SLOW-NEXT: slli a1, a1, 8 -; RV32-SLOW-NEXT: or a0, a1, a0 +; RV32-SLOW-NEXT: lbu a1, 0(a0) +; RV32-SLOW-NEXT: lbu a0, 1(a0) +; RV32-SLOW-NEXT: slli a0, a0, 8 +; RV32-SLOW-NEXT: or a0, a0, a1 ; RV32-SLOW-NEXT: vmv.s.x v8, a0 ; RV32-SLOW-NEXT: 
vsetivli zero, 2, e16, mf4, ta, ma ; RV32-SLOW-NEXT: vslideup.vi v9, v8, 1 @@ -118,10 +118,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> % ; RV64-SLOW-NEXT: # %bb.1: # %cond.load ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-SLOW-NEXT: vmv.x.s a1, v8 -; RV64-SLOW-NEXT: lbu a2, 1(a1) -; RV64-SLOW-NEXT: lbu a1, 0(a1) -; RV64-SLOW-NEXT: slli a2, a2, 8 -; RV64-SLOW-NEXT: or a1, a2, a1 +; RV64-SLOW-NEXT: lbu a2, 0(a1) +; RV64-SLOW-NEXT: lbu a1, 1(a1) +; RV64-SLOW-NEXT: slli a1, a1, 8 +; RV64-SLOW-NEXT: or a1, a1, a2 ; RV64-SLOW-NEXT: vsetvli zero, zero, e16, m2, tu, ma ; RV64-SLOW-NEXT: vmv.s.x v9, a1 ; RV64-SLOW-NEXT: .LBB4_2: # %else @@ -131,10 +131,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> % ; RV64-SLOW-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1 ; RV64-SLOW-NEXT: vmv.x.s a0, v8 -; RV64-SLOW-NEXT: lbu a1, 1(a0) -; RV64-SLOW-NEXT: lbu a0, 0(a0) -; RV64-SLOW-NEXT: slli a1, a1, 8 -; RV64-SLOW-NEXT: or a0, a1, a0 +; RV64-SLOW-NEXT: lbu a1, 0(a0) +; RV64-SLOW-NEXT: lbu a0, 1(a0) +; RV64-SLOW-NEXT: slli a0, a0, 8 +; RV64-SLOW-NEXT: or a0, a0, a1 ; RV64-SLOW-NEXT: vmv.s.x v8, a0 ; RV64-SLOW-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; RV64-SLOW-NEXT: vslideup.vi v9, v8, 1 @@ -204,10 +204,10 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> % ; RV64-SLOW-NEXT: # %bb.1: # %cond.load ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-SLOW-NEXT: vmv.x.s a1, v8 -; RV64-SLOW-NEXT: lwu a2, 4(a1) -; RV64-SLOW-NEXT: lwu a1, 0(a1) -; RV64-SLOW-NEXT: slli a2, a2, 32 -; RV64-SLOW-NEXT: or a1, a2, a1 +; RV64-SLOW-NEXT: lwu a2, 0(a1) +; RV64-SLOW-NEXT: lwu a1, 4(a1) +; RV64-SLOW-NEXT: slli a1, a1, 32 +; RV64-SLOW-NEXT: or a1, a1, a2 ; RV64-SLOW-NEXT: vmv.s.x v9, a1 ; RV64-SLOW-NEXT: .LBB5_2: # %else ; RV64-SLOW-NEXT: andi a0, a0, 2 @@ -216,10 +216,10 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> % ; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1 ; RV64-SLOW-NEXT: vmv.x.s a0, v8 -; RV64-SLOW-NEXT: lwu a1, 4(a0) -; RV64-SLOW-NEXT: lwu a0, 0(a0) -; RV64-SLOW-NEXT: slli a1, a1, 32 -; RV64-SLOW-NEXT: or a0, a1, a0 +; RV64-SLOW-NEXT: lwu a1, 0(a0) +; RV64-SLOW-NEXT: lwu a0, 4(a0) +; RV64-SLOW-NEXT: slli a0, a0, 32 +; RV64-SLOW-NEXT: or a0, a0, a1 ; RV64-SLOW-NEXT: vmv.s.x v8, a0 ; RV64-SLOW-NEXT: vslideup.vi v9, v8, 1 ; RV64-SLOW-NEXT: .LBB5_4: # %else2 @@ -489,12 +489,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi ; RV32-SLOW-NEXT: # implicit-def: $v8 ; RV32-SLOW-NEXT: beqz a3, .LBB8_2 ; RV32-SLOW-NEXT: # %bb.1: # %cond.load -; RV32-SLOW-NEXT: lbu a3, 1(a0) -; RV32-SLOW-NEXT: lbu a4, 0(a0) +; RV32-SLOW-NEXT: lbu a3, 0(a0) +; RV32-SLOW-NEXT: lbu a4, 1(a0) ; RV32-SLOW-NEXT: lbu a5, 2(a0) ; RV32-SLOW-NEXT: lbu a6, 3(a0) -; RV32-SLOW-NEXT: slli a3, a3, 8 -; RV32-SLOW-NEXT: or a3, a3, a4 +; RV32-SLOW-NEXT: slli a4, a4, 8 +; RV32-SLOW-NEXT: or a3, a4, a3 ; RV32-SLOW-NEXT: slli a5, a5, 16 ; RV32-SLOW-NEXT: slli a6, a6, 24 ; RV32-SLOW-NEXT: or a4, a6, a5 @@ -505,12 +505,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi ; RV32-SLOW-NEXT: andi a2, a2, 2 ; RV32-SLOW-NEXT: beqz a2, .LBB8_4 ; RV32-SLOW-NEXT: # %bb.3: # %cond.load1 -; RV32-SLOW-NEXT: lbu a2, 5(a0) -; RV32-SLOW-NEXT: lbu a3, 4(a0) +; RV32-SLOW-NEXT: lbu a2, 4(a0) +; RV32-SLOW-NEXT: lbu a3, 5(a0) ; RV32-SLOW-NEXT: lbu a4, 6(a0) ; 
RV32-SLOW-NEXT: lbu a0, 7(a0) -; RV32-SLOW-NEXT: slli a2, a2, 8 -; RV32-SLOW-NEXT: or a2, a2, a3 +; RV32-SLOW-NEXT: slli a3, a3, 8 +; RV32-SLOW-NEXT: or a2, a3, a2 ; RV32-SLOW-NEXT: slli a4, a4, 16 ; RV32-SLOW-NEXT: slli a0, a0, 24 ; RV32-SLOW-NEXT: or a0, a0, a4 @@ -533,12 +533,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi ; RV64-SLOW-NEXT: # implicit-def: $v8 ; RV64-SLOW-NEXT: beqz a3, .LBB8_2 ; RV64-SLOW-NEXT: # %bb.1: # %cond.load -; RV64-SLOW-NEXT: lbu a3, 1(a0) -; RV64-SLOW-NEXT: lbu a4, 0(a0) +; RV64-SLOW-NEXT: lbu a3, 0(a0) +; RV64-SLOW-NEXT: lbu a4, 1(a0) ; RV64-SLOW-NEXT: lbu a5, 2(a0) ; RV64-SLOW-NEXT: lb a6, 3(a0) -; RV64-SLOW-NEXT: slli a3, a3, 8 -; RV64-SLOW-NEXT: or a3, a3, a4 +; RV64-SLOW-NEXT: slli a4, a4, 8 +; RV64-SLOW-NEXT: or a3, a4, a3 ; RV64-SLOW-NEXT: slli a5, a5, 16 ; RV64-SLOW-NEXT: slli a6, a6, 24 ; RV64-SLOW-NEXT: or a4, a6, a5 @@ -549,12 +549,12 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi ; RV64-SLOW-NEXT: andi a2, a2, 2 ; RV64-SLOW-NEXT: beqz a2, .LBB8_4 ; RV64-SLOW-NEXT: # %bb.3: # %cond.load1 -; RV64-SLOW-NEXT: lbu a2, 5(a0) -; RV64-SLOW-NEXT: lbu a3, 4(a0) +; RV64-SLOW-NEXT: lbu a2, 4(a0) +; RV64-SLOW-NEXT: lbu a3, 5(a0) ; RV64-SLOW-NEXT: lbu a4, 6(a0) ; RV64-SLOW-NEXT: lb a0, 7(a0) -; RV64-SLOW-NEXT: slli a2, a2, 8 -; RV64-SLOW-NEXT: or a2, a2, a3 +; RV64-SLOW-NEXT: slli a3, a3, 8 +; RV64-SLOW-NEXT: or a2, a3, a2 ; RV64-SLOW-NEXT: slli a4, a4, 16 ; RV64-SLOW-NEXT: slli a0, a0, 24 ; RV64-SLOW-NEXT: or a0, a0, a4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll index 7ee8179acfdb9..e56b7c75c41d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -366,12 +366,12 @@ define <256 x i8> @vadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %ev ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: addi a3, a1, -128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a0) -; CHECK-NEXT: addi a0, a1, -128 -; CHECK-NEXT: sltu a3, a1, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: sltu a0, a1, a3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t ; CHECK-NEXT: bltu a1, a2, .LBB32_2 @@ -1357,9 +1357,9 @@ declare <32 x i64> @llvm.vp.add.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vadd_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB108_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll index fa82065f3b413..9678fa87dc9b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll @@ -298,46 +298,36 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 
0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: bltu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfsgnj.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfsgnj.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfsgnj.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll index 08f486b601328..990cf03a2e9b5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll @@ -363,9 +363,9 @@ declare <32 x double> @llvm.vp.fabs.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vfabs_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfabs_vv_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll index bde842dcc7600..a6c51ced93ddc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll @@ -849,35 +849,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a2) -; CHECK-NEXT: addi a2, a0, 128 -; CHECK-NEXT: vle64.v v8, 
(a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: addi a3, a2, 128 +; CHECK-NEXT: addi a5, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a4, a1, .LBB50_2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v8, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a5) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: bltu a4, a2, .LBB50_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB50_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -893,16 +893,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 @@ -941,26 +941,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: addi a3, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a2) -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v24, (a3) ; CHECK-NEXT: vle64.v v0, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 -; CHECK-NEXT: bltu a4, a1, .LBB51_2 +; CHECK-NEXT: bltu a4, a2, .LBB51_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: 
.LBB51_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v8, v16 ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll index b37c47a32ba21..13c8077a84c56 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll @@ -390,46 +390,36 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: bltu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmax.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfmax.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll index 261523e8ace50..fd43b8bbaf185 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll @@ -390,46 +390,36 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: bltu a2, a1, .LBB26_2 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: bltu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmin.vv v8, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmin.vv v16, v16, v24, v0.t +; CHECK-NEXT: vfmin.vv v16, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll index a5d9b3439e29b..eb4ce757a8385 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll @@ -621,35 +621,35 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a2, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a2) -; CHECK-NEXT: addi a2, a0, 128 -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 
16 -; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: addi a3, a2, 128 +; CHECK-NEXT: addi a5, a0, 128 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: bltu a4, a1, .LBB50_2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a2) +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v8, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a5) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: bltu a4, a2, .LBB50_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB50_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -665,16 +665,16 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v8, v24, v16, v0.t ; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 @@ -713,26 +713,26 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> % ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a2, 128 +; CHECK-NEXT: addi a3, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a2) -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle64.v v24, (a2) +; CHECK-NEXT: mv a1, a4 +; CHECK-NEXT: vle64.v v24, (a3) ; CHECK-NEXT: vle64.v v0, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a4 -; CHECK-NEXT: bltu a4, a1, .LBB51_2 +; CHECK-NEXT: bltu a4, a2, .LBB51_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB51_2: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v0, v8, v16 ; CHECK-NEXT: addi a0, a4, -16 ; CHECK-NEXT: sltu a1, a4, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll index 968fd9f9bab80..a3853d19c3ef9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -355,9 +355,9 @@ declare <32 x double> @llvm.vp.fneg.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vfneg_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfneg_vv_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll index 6244419de65b1..d87c1e332ce65 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll @@ -379,9 +379,9 @@ declare <32 x double> @llvm.vp.sqrt.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vfsqrt_vv_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfsqrt_vv_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll index fec54b36042fa..28ac46cd5fc88 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -270,12 +270,12 @@ define <256 x i8> @vmax_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a3, 128 +; CHECK-NEXT: addi a4, a2, -128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a1) -; CHECK-NEXT: addi a1, a2, -128 -; CHECK-NEXT: sltu a4, a2, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a1, a2, a4 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmax.vx v16, v16, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB22_2 @@ -1029,9 +1029,9 @@ declare <32 x i64> @llvm.vp.smax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmax_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmax_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB74_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll index 7ca0dbd9adffc..b7555f4b3588b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -269,12 +269,12 @@ define <256 x i8> @vmaxu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; 
CHECK-NEXT: li a3, 128 +; CHECK-NEXT: addi a4, a2, -128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a1) -; CHECK-NEXT: addi a1, a2, -128 -; CHECK-NEXT: sltu a4, a2, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a1, a2, a4 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmaxu.vx v16, v16, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB22_2 @@ -1028,9 +1028,9 @@ declare <32 x i64> @llvm.vp.umax.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmaxu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmaxu_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB74_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll index ea75742ca6e43..bd49b9876575e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -270,12 +270,12 @@ define <256 x i8> @vmin_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zero ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a3, 128 +; CHECK-NEXT: addi a4, a2, -128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a1) -; CHECK-NEXT: addi a1, a2, -128 -; CHECK-NEXT: sltu a4, a2, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a1, a2, a4 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmin.vx v16, v16, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB22_2 @@ -1029,9 +1029,9 @@ declare <32 x i64> @llvm.vp.smin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vmin_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmin_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB74_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll index f4f54db64018d..f6e5fd42f07ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -269,12 +269,12 @@ define <256 x i8> @vminu_vx_v258i8(<256 x i8> %va, i8 %b, <256 x i1> %m, i32 zer ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a3, 128 +; CHECK-NEXT: addi a4, a2, -128 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a1) -; CHECK-NEXT: addi a1, a2, -128 -; CHECK-NEXT: sltu a4, a2, a1 -; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: and a1, a4, a1 +; CHECK-NEXT: sltu a1, a2, a4 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a4 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vminu.vx v16, v16, a0, v0.t ; CHECK-NEXT: bltu a2, a3, .LBB22_2 @@ -1028,9 +1028,9 @@ declare <32 x i64> @llvm.vp.umin.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) define <32 x i64> @vminu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vminu_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 
2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB74_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 1f6513ae09d60..36cc8dd25bf94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -2052,13 +2052,13 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v24, (zero), v8, v0.t ; RV32-NEXT: addi a1, a0, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (zero), v8, v0.t ; RV32-NEXT: vmv8r.v v8, v24 @@ -2077,9 +2077,9 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex ; RV64-NEXT: addi a1, a0, -16 ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (zero), v16, v0.t ; RV64-NEXT: ret @@ -2093,8 +2093,8 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB95_2 ; RV32-NEXT: # %bb.1: @@ -2103,13 +2103,13 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2119,11 +2119,11 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsext.vf8 v24, v10 +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsll.vi v16, v24, 3 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsext.vf8 v16, v10 -; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB95_2 ; RV64-NEXT: # %bb.1: @@ -2134,9 +2134,9 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs, ; RV64-NEXT: addi a2, a1, -16 ; 
RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2151,8 +2151,8 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsext.vf4 v16, v8 -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB96_2 ; RV32-NEXT: # %bb.1: @@ -2161,13 +2161,13 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2175,14 +2175,14 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-LABEL: vpgather_baseidx_sext_v32i8_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB96_2 ; RV64-NEXT: # %bb.1: @@ -2193,9 +2193,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2210,11 +2210,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 ; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 ; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; RV32-NEXT: vwmulu.vx v16, v8, a3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB97_2 +; RV32-NEXT: bltu a1, a3, .LBB97_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB97_2: @@ -2225,9 +2225,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32-NEXT: addi a2, a1, -16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei16.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2236,11 +2236,11 @@ 
define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64: # %bb.0: ; RV64-NEXT: li a2, 32 ; RV64-NEXT: li a3, 8 -; RV64-NEXT: li a4, 16 ; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; RV64-NEXT: vwmulu.vx v16, v8, a3 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 -; RV64-NEXT: bltu a1, a4, .LBB97_2 +; RV64-NEXT: bltu a1, a3, .LBB97_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB97_2: @@ -2251,9 +2251,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei16.v v16, (a0), v24, v0.t ; RV64-NEXT: ret @@ -2268,11 +2268,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 ; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV32-NEXT: vwmulsu.vx v16, v8, a3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB98_2 +; RV32-NEXT: bltu a1, a3, .LBB98_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB98_2: @@ -2283,9 +2283,9 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV32-NEXT: addi a2, a1, -16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2295,11 +2295,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v12, v8, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsext.vf4 v24, v12 +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsll.vi v16, v24, 3 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsext.vf4 v16, v12 -; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB98_2 ; RV64-NEXT: # %bb.1: @@ -2310,9 +2310,9 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2326,11 +2326,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 ; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV32-NEXT: vwmulsu.vx v16, v8, a3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB99_2 +; RV32-NEXT: bltu a1, a3, .LBB99_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB99_2: @@ -2341,9 +2341,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32-NEXT: addi a2, a1, -16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma 
; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2351,14 +2351,14 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-LABEL: vpgather_baseidx_sext_v32i16_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsext.vf4 v16, v8 ; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v16, v8 -; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v16, 3 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB99_2 ; RV64-NEXT: # %bb.1: @@ -2369,9 +2369,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2386,11 +2386,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 ; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV32-NEXT: vwmulu.vx v16, v8, a3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB100_2 +; RV32-NEXT: bltu a1, a3, .LBB100_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB100_2: @@ -2401,9 +2401,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32-NEXT: addi a2, a1, -16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 -; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2412,11 +2412,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64: # %bb.0: ; RV64-NEXT: li a2, 32 ; RV64-NEXT: li a3, 8 -; RV64-NEXT: li a4, 16 ; RV64-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; RV64-NEXT: vwmulu.vx v16, v8, a3 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 -; RV64-NEXT: bltu a1, a4, .LBB100_2 +; RV64-NEXT: bltu a1, a3, .LBB100_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB100_2: @@ -2427,9 +2427,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV64-NEXT: ret @@ -2443,9 +2443,9 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV32-LABEL: vpgather_baseidx_v32i32_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsll.vi v16, v8, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB101_2 ; RV32-NEXT: 
# %bb.1: @@ -2454,13 +2454,13 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2470,10 +2470,10 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: li a2, 8 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vwmulsu.vx v16, v24, a2 ; RV64-NEXT: vwmulsu.vx v24, v8, a2 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB101_2 ; RV64-NEXT: # %bb.1: @@ -2484,9 +2484,9 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2499,9 +2499,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV32-LABEL: vpgather_baseidx_sext_v32i32_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsll.vi v16, v8, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB102_2 ; RV32-NEXT: # %bb.1: @@ -2510,13 +2510,13 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2526,10 +2526,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: li a2, 8 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vwmulsu.vx v16, v24, a2 ; RV64-NEXT: vwmulsu.vx v24, v8, a2 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB102_2 ; RV64-NEXT: # %bb.1: @@ -2540,9 +2540,9 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; 
RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2556,9 +2556,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV32-LABEL: vpgather_baseidx_zext_v32i32_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vsll.vi v16, v8, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: bltu a1, a3, .LBB103_2 ; RV32-NEXT: # %bb.1: @@ -2567,13 +2567,13 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2583,10 +2583,10 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: li a2, 8 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vwmulu.vx v16, v24, a2 ; RV64-NEXT: vwmulu.vx v24, v8, a2 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB103_2 ; RV64-NEXT: # %bb.1: @@ -2597,9 +2597,9 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret @@ -2618,16 +2618,16 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x ; RV32-NEXT: vnsrl.wi v16, v8, 0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: addi a3, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vslideup.vi v16, v24, 16 ; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: sltu a2, a1, a3 ; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: and a2, a2, a3 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: li a2, 16 @@ -2644,8 +2644,8 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB104_2 ; RV64-NEXT: # %bb.1: @@ -2656,9 +2656,9 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 
x ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a1, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll index 6c9989775f790..4f3179823f5b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll @@ -374,12 +374,12 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0), v0.t ; CHECK-NEXT: addi a2, a1, -16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: sltu a1, a1, a2 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a1, a1, a2 -; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a0), v0.t ; CHECK-NEXT: ret @@ -403,12 +403,12 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: addi a4, a3, -16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: sltu a3, a3, a4 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: addi a4, a1, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v8, 2 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a4), v0.t ; CHECK-NEXT: addi a3, a2, -32 @@ -420,9 +420,9 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: .LBB32_4: +; CHECK-NEXT: addi a5, a1, 256 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v8, 4 -; CHECK-NEXT: addi a5, a1, 256 ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v24, (a5), v0.t ; CHECK-NEXT: bltu a2, a3, .LBB32_6 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index 6394542479d1b..c6e64fe2bd32e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -1360,22 +1360,22 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB83_2 +; CHECK-NEXT: bltu a2, a3, .LBB83_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB83_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: addi a1, sp, 16 ; 
CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma @@ -1406,9 +1406,9 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1> ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma ; CHECK-NEXT: vfmerge.vfm v16, v16, fa0, v0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll index f7e4716d2c847..cf5650c0ab4ed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1756,13 +1756,13 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (zero), v24, v0.t ; RV32-NEXT: addi a0, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: sltu a1, a1, a0 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (zero), v8, v0.t ; RV32-NEXT: ret @@ -1778,23 +1778,23 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: li a1, 16 -; RV64-NEXT: mv a0, a2 -; RV64-NEXT: bltu a2, a1, .LBB83_2 +; RV64-NEXT: bltu a2, a3, .LBB83_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a0, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB83_2: -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: addi a0, a2, -16 ; RV64-NEXT: sltu a1, a2, a0 ; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a1, a0 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a0, a1, a0 ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -1816,8 +1816,8 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: bltu a2, a3, .LBB84_2 ; RV32-NEXT: # %bb.1: @@ -1826,13 +1826,13 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: addi a1, a2, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: sltu a2, a2, a1 ; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: 
and a1, a2, a1 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: ret @@ -1854,14 +1854,14 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: li a1, 8 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v24, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vwmulsu.vx v8, v16, a1 ; RV64-NEXT: vwmulsu.vx v16, v24, a1 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB84_2 ; RV64-NEXT: # %bb.1: @@ -1870,20 +1870,20 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV64-NEXT: addi a3, sp, 16 ; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 @@ -1902,8 +1902,8 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: bltu a2, a3, .LBB85_2 ; RV32-NEXT: # %bb.1: @@ -1912,13 +1912,13 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: addi a1, a2, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: sltu a2, a2, a1 ; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: ret @@ -1940,14 +1940,14 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: li a1, 8 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: 
vslidedown.vi v16, v24, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vwmulsu.vx v8, v16, a1 ; RV64-NEXT: vwmulsu.vx v16, v24, a1 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB85_2 ; RV64-NEXT: # %bb.1: @@ -1956,20 +1956,20 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: addi a3, sp, 16 ; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 @@ -1989,8 +1989,8 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a1) -; RV32-NEXT: li a3, 16 ; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: mv a1, a2 ; RV32-NEXT: bltu a2, a3, .LBB86_2 ; RV32-NEXT: # %bb.1: @@ -1999,13 +1999,13 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: addi a1, a2, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: sltu a2, a2, a1 ; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: and a1, a2, a1 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v8, v0.t ; RV32-NEXT: ret @@ -2027,14 +2027,14 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: li a1, 8 -; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v24, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vwmulu.vx v8, v16, a1 ; RV64-NEXT: vwmulu.vx v16, v24, a1 +; RV64-NEXT: li a3, 16 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB86_2 ; RV64-NEXT: # %bb.1: @@ -2043,20 +2043,20 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: addi a3, sp, 16 ; RV64-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v24, (a0), v8, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, 
-1 +; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 -; RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 4 ; RV64-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll index d30e8b46e6df2..d3a8e8548f5b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll @@ -304,12 +304,12 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v8, (a0), v0.t ; CHECK-NEXT: addi a2, a1, -16 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: sltu a1, a1, a2 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a1, a1, a2 -; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vse64.v v16, (a0), v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index 7afd31fdd663c..8a15fa6929708 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -375,12 +375,12 @@ define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: addi a3, a1, -128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a0) -; CHECK-NEXT: addi a0, a1, -128 -; CHECK-NEXT: sltu a3, a1, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: sltu a0, a1, a3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t ; CHECK-NEXT: bltu a1, a2, .LBB32_2 @@ -1370,9 +1370,9 @@ declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vsadd_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB108_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index f61b112fd8024..0f2ff55d767d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -371,12 +371,12 @@ define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: addi a3, a1, -128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vlm.v v0, (a0) -; CHECK-NEXT: addi 
a0, a1, -128 -; CHECK-NEXT: sltu a3, a1, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: sltu a0, a1, a3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t ; CHECK-NEXT: bltu a1, a2, .LBB32_2 @@ -1366,9 +1366,9 @@ declare <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vsaddu_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB108_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll index dc83edba5ae8c..c5506e175ce00 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll @@ -8,104 +8,48 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 48 -; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 40 -; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: li a2, 128 -; CHECK-NEXT: addi a4, a3, 128 -; CHECK-NEXT: addi a5, a3, 384 +; CHECK-NEXT: addi a4, a3, 256 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a5) -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a5, 24 -; CHECK-NEXT: mul a2, a2, a5 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: addi a2, a1, 128 -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a3, 256 -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vle8.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle8.v v24, (a4) +; CHECK-NEXT: addi a2, a3, 384 +; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: addi a1, a1, 128 +; CHECK-NEXT: vadd.vv v8, v0, v24 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a2) +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vadd.vv v24, v24, v0 +; CHECK-NEXT: addi a1, a3, 128 +; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vadd.vv v16, v16, v0 ; CHECK-NEXT: 
vle8.v v0, (a3) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v16, v16, v8 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v24, v8, v24 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 40 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vadd.vv v0, v8, v0 ; CHECK-NEXT: vse8.v v0, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vse8.v v16, (a1) +; CHECK-NEXT: vse8.v v24, (a1) ; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse8.v v24, (a0) +; CHECK-NEXT: vse8.v v16, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 48 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -121,10 +65,10 @@ define <512 x i8> @vadd_v512i8_zvl256(<512 x i8> %a, <512 x i8> %b) #1 { ; CHECK-NEXT: addi a1, a0, 256 ; CHECK-NEXT: li a2, 256 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: vle8.v v0, (a1) -; CHECK-NEXT: vadd.vv v8, v8, v24 -; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v0, (a0) +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: ret %c = add <512 x i8> %a, %b ret <512 x i8> %c diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 05254e60b65b7..81c98d6881e72 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -155,46 +155,30 @@ declare <256 x i8> @llvm.vp.select.v256i8(<256 x i1>, <256 x i8>, <256 x i8>, i3 define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 zeroext %evl) { ; CHECK-LABEL: select_v256i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: 
vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv1r.v v6, v8 ; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: addi a4, a1, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: addi a0, a3, -128 -; CHECK-NEXT: vle8.v v8, (a4) +; CHECK-NEXT: vle8.v v24, (a4) ; CHECK-NEXT: sltu a4, a3, a0 -; CHECK-NEXT: vle8.v v16, (a1) ; CHECK-NEXT: addi a4, a4, -1 ; CHECK-NEXT: and a0, a4, a0 -; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: vmerge.vvm v24, v24, v8, v0 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: bltu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB11_2: ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.select.v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 %evl) ret <256 x i8> %v @@ -203,58 +187,21 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3 define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c) { ; CHECK-LABEL: select_evl_v256i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a2, a2, a3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vle8.v v24, (a0) -; CHECK-NEXT: vle8.v v16, (a1) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, a1, 128 +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 ; CHECK-NEXT: vsetvli 
zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.select.v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 129) ret <256 x i8> %v @@ -418,23 +365,23 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: li a3, 16 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a1) +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle64.v v24, (a0) -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB25_2 +; CHECK-NEXT: bltu a2, a3, .LBB25_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB25_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 2 -; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -453,56 +400,16 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) { ; CHECK-LABEL: select_evl_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a1, a0, 128 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v24, (a0) ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: 
vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 ; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 17) ret <32 x i64> %v @@ -621,20 +528,20 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> % ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: bltu a2, a3, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: .LBB35_2: -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 ; CHECK-NEXT: addi a0, a2, -32 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index 557882ee31d4c..75f0119d14c2a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -5,26 +5,26 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: lbu a2, 0(a2) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: lbu a1, 0(a2) +; RV32-NEXT: slli a2, a1, 30 +; RV32-NEXT: andi a4, a1, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: slli a4, a2, 29 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: slli a4, a1, 29 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: slli a2, a1, 28 ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: slli a4, a2, 27 -; RV32-NEXT: srli a2, a2, 5 -; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: slli a4, a1, 27 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: srli a2, a2, 31 ; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a4 ; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 @@ -35,26 +35,26 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vv_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: lbu a2, 0(a2) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) -; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: lbu a1, 0(a2) +; RV64-NEXT: slli a2, a1, 62 +; 
RV64-NEXT: andi a4, a1, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: slli a4, a2, 61 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: slli a4, a1, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: slli a2, a1, 60 ; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: slli a4, a2, 59 -; RV64-NEXT: srli a2, a2, 5 -; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: slli a4, a1, 59 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: srli a2, a2, 63 ; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a4 ; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 @@ -73,26 +73,26 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vx_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: lbu a2, 0(a2) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: lbu a1, 0(a2) +; RV32-NEXT: slli a2, a1, 30 +; RV32-NEXT: andi a4, a1, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: slli a4, a2, 29 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: slli a4, a1, 29 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: slli a2, a1, 28 ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: slli a4, a2, 27 -; RV32-NEXT: srli a2, a2, 5 -; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: slli a4, a1, 27 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: srli a2, a2, 31 ; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a4 ; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 @@ -103,26 +103,26 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vx_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: lbu a2, 0(a2) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) -; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: lbu a1, 0(a2) +; RV64-NEXT: slli a2, a1, 62 +; RV64-NEXT: andi a4, a1, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: slli a4, a2, 61 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: slli a4, a1, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: slli a2, a1, 60 ; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: slli a4, a2, 59 -; RV64-NEXT: srli a2, a2, 5 -; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: slli a4, a1, 59 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: srli a2, a2, 63 ; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a4 ; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 ; 
RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 @@ -142,26 +142,26 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vi_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: lbu a1, 0(a1) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: slli a0, a1, 30 -; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: lbu a0, 0(a1) +; RV32-NEXT: slli a1, a0, 30 +; RV32-NEXT: andi a3, a0, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a3 -; RV32-NEXT: slli a3, a1, 29 -; RV32-NEXT: srli a0, a0, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: slli a3, a0, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a0, 28 ; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a3 -; RV32-NEXT: slli a3, a1, 27 -; RV32-NEXT: srli a1, a1, 5 -; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: slli a3, a0, 27 +; RV32-NEXT: srli a0, a0, 5 +; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: srli a3, a3, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslide1down.vx v10, v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 @@ -172,26 +172,26 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vi_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: lbu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: slli a0, a1, 62 -; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: slli a1, a0, 62 +; RV64-NEXT: andi a3, a0, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a3 -; RV64-NEXT: slli a3, a1, 61 -; RV64-NEXT: srli a0, a0, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: slli a3, a0, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a0, 60 ; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a3 -; RV64-NEXT: slli a3, a1, 59 -; RV64-NEXT: srli a1, a1, 5 -; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: slli a3, a0, 59 +; RV64-NEXT: srli a0, a0, 5 +; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: srli a3, a3, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vslide1down.vx v10, v10, a3 ; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslide1down.vx v10, v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 @@ -210,26 +210,26 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: lbu a2, 0(a2) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: lbu a1, 0(a2) +; RV32-NEXT: slli a2, a1, 30 +; RV32-NEXT: andi a4, a1, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: slli a4, a2, 29 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 28 +; RV32-NEXT: slli a4, a1, 29 +; RV32-NEXT: srli a2, a2, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a2 +; 
RV32-NEXT: slli a2, a1, 28 ; RV32-NEXT: srli a4, a4, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: slli a4, a2, 27 -; RV32-NEXT: srli a2, a2, 5 -; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: slli a4, a1, 27 +; RV32-NEXT: srli a1, a1, 5 +; RV32-NEXT: srli a2, a2, 31 ; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a4 ; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslide1down.vx v10, v10, a4 +; RV32-NEXT: vslide1down.vx v10, v10, a1 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 @@ -240,26 +240,26 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vv_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: lbu a2, 0(a2) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) -; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: lbu a1, 0(a2) +; RV64-NEXT: slli a2, a1, 62 +; RV64-NEXT: andi a4, a1, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: slli a4, a2, 61 -; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 60 +; RV64-NEXT: slli a4, a1, 61 +; RV64-NEXT: srli a2, a2, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: slli a2, a1, 60 ; RV64-NEXT: srli a4, a4, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: slli a4, a2, 59 -; RV64-NEXT: srli a2, a2, 5 -; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: slli a4, a1, 59 +; RV64-NEXT: srli a1, a1, 5 +; RV64-NEXT: srli a2, a2, 63 ; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a4 ; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslide1down.vx v10, v10, a4 +; RV64-NEXT: vslide1down.vx v10, v10, a1 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 @@ -278,26 +278,26 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vx_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: lbu a1, 0(a1) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: slli a0, a1, 30 -; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: lbu a0, 0(a1) +; RV32-NEXT: slli a1, a0, 30 +; RV32-NEXT: andi a3, a0, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a3 -; RV32-NEXT: slli a3, a1, 29 -; RV32-NEXT: srli a0, a0, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: slli a3, a0, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a0, 28 ; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a3 -; RV32-NEXT: slli a3, a1, 27 -; RV32-NEXT: srli a1, a1, 5 -; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: slli a3, a0, 27 +; RV32-NEXT: srli a0, a0, 5 +; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: srli a3, a3, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslide1down.vx v10, v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 @@ -308,26 +308,26 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vx_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: lbu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: 
vle32.v v8, (a0) -; RV64-NEXT: slli a0, a1, 62 -; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: slli a1, a0, 62 +; RV64-NEXT: andi a3, a0, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a3 -; RV64-NEXT: slli a3, a1, 61 -; RV64-NEXT: srli a0, a0, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: slli a3, a0, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a0, 60 ; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a3 -; RV64-NEXT: slli a3, a1, 59 -; RV64-NEXT: srli a1, a1, 5 -; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: slli a3, a0, 59 +; RV64-NEXT: srli a0, a0, 5 +; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: srli a3, a3, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: vslide1down.vx v10, v10, a3 ; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslide1down.vx v10, v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 @@ -347,26 +347,26 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vfpzero_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: lbu a1, 0(a1) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: slli a0, a1, 30 -; RV32-NEXT: andi a3, a1, 1 +; RV32-NEXT: lbu a0, 0(a1) +; RV32-NEXT: slli a1, a0, 30 +; RV32-NEXT: andi a3, a0, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.x v10, a3 -; RV32-NEXT: slli a3, a1, 29 -; RV32-NEXT: srli a0, a0, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: slli a0, a1, 28 +; RV32-NEXT: slli a3, a0, 29 +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: slli a1, a0, 28 ; RV32-NEXT: srli a3, a3, 31 ; RV32-NEXT: vslide1down.vx v10, v10, a3 -; RV32-NEXT: slli a3, a1, 27 -; RV32-NEXT: srli a1, a1, 5 -; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: slli a3, a0, 27 +; RV32-NEXT: srli a0, a0, 5 +; RV32-NEXT: srli a1, a1, 31 ; RV32-NEXT: srli a3, a3, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vslide1down.vx v10, v10, a3 ; RV32-NEXT: vslide1down.vx v10, v10, a1 +; RV32-NEXT: vslide1down.vx v10, v10, a3 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vslidedown.vi v10, v10, 2 ; RV32-NEXT: vand.vi v10, v10, 1 ; RV32-NEXT: vmsne.vi v0, v10, 0 @@ -377,26 +377,26 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vfpzero_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: lbu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: slli a0, a1, 62 -; RV64-NEXT: andi a3, a1, 1 +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: slli a1, a0, 62 +; RV64-NEXT: andi a3, a0, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.x v10, a3 -; RV64-NEXT: slli a3, a1, 61 -; RV64-NEXT: srli a0, a0, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: slli a0, a1, 60 +; RV64-NEXT: slli a3, a0, 61 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: slli a1, a0, 60 ; RV64-NEXT: srli a3, a3, 63 ; RV64-NEXT: vslide1down.vx v10, v10, a3 -; RV64-NEXT: slli a3, a1, 59 -; RV64-NEXT: srli a1, a1, 5 -; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: slli a3, a0, 59 +; RV64-NEXT: srli a0, a0, 5 +; RV64-NEXT: srli a1, a1, 63 ; RV64-NEXT: srli a3, a3, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a0 -; RV64-NEXT: 
vslide1down.vx v10, v10, a3 ; RV64-NEXT: vslide1down.vx v10, v10, a1 +; RV64-NEXT: vslide1down.vx v10, v10, a3 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vslidedown.vi v10, v10, 2 ; RV64-NEXT: vand.vi v10, v10, 1 ; RV64-NEXT: vmsne.vi v0, v10, 0 @@ -415,8 +415,8 @@ define void @vselect_vv_v8i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vv_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle32.v v8, (a0), v0.t ; CHECK-NEXT: vse32.v v8, (a3) ; CHECK-NEXT: ret @@ -432,8 +432,8 @@ define void @vselect_vx_v8i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vx_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vse32.v v8, (a3) ; CHECK-NEXT: ret @@ -450,8 +450,8 @@ define void @vselect_vi_v8i32(ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vi_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: vse32.v v8, (a2) ; CHECK-NEXT: ret @@ -466,8 +466,8 @@ define void @vselect_vv_v8f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle32.v v8, (a0), v0.t ; CHECK-NEXT: vse32.v v8, (a3) ; CHECK-NEXT: ret @@ -483,8 +483,8 @@ define void @vselect_vx_v8f32(float %a, ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vx_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; CHECK-NEXT: vse32.v v8, (a2) ; CHECK-NEXT: ret @@ -501,8 +501,8 @@ define void @vselect_vfpzero_v8f32(ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vfpzero_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 ; CHECK-NEXT: vse32.v v8, (a2) ; CHECK-NEXT: ret @@ -517,8 +517,8 @@ define void @vselect_vv_v16i16(ptr %a, ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vv_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle16.v v8, (a0), v0.t ; CHECK-NEXT: vse16.v v8, (a3) ; CHECK-NEXT: ret @@ -534,8 +534,8 @@ define void @vselect_vx_v16i16(i16 signext %a, ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vx_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 ; CHECK-NEXT: vse16.v v8, (a3) ; CHECK-NEXT: ret @@ -552,8 +552,8 @@ define void @vselect_vi_v16i16(ptr %b, ptr %cc, ptr %z) { ; CHECK-LABEL: vselect_vi_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vmerge.vim v8, v8, 4, v0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret @@ -569,8 +569,8 @@ 
define void @vselect_vv_v32f16(ptr %a, ptr %b, ptr %cc, ptr %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a4, 32 ; CHECK-NEXT: vsetvli zero, a4, e16, m4, ta, mu -; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: vle16.v v8, (a0), v0.t ; CHECK-NEXT: vse16.v v8, (a3) ; CHECK-NEXT: ret @@ -587,8 +587,8 @@ define void @vselect_vx_v32f16(half %a, ptr %b, ptr %cc, ptr %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret @@ -606,8 +606,8 @@ define void @vselect_vfpzero_v32f16(ptr %b, ptr %cc, ptr %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vlm.v v0, (a1) ; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index 6ddf2e464750e..3e64b019643d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -1410,9 +1410,9 @@ declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vssub_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB108_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index c403593894794..8ad1fc384364b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -1405,9 +1405,9 @@ declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vssubu_vx_v32i64: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB108_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll index d241b78e41391..5a343b35e8fad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll @@ -41,8 +41,8 @@ define <8 x i64> @vwaddu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 42 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vwaddu.vv v12, v8, v10 ; CHECK-NEXT: vmv4r.v v8, v12 @@ -77,8 +77,8 @@ define <8 x i64> @vwadd_wv_mask_v8i32_nonzero(<8 x i32> %x, <8 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 42 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 ; CHECK-NEXT: 
vwadd.wv v8, v12, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index 50184796b38f5..98188799fcca5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -418,8 +418,8 @@ define <4 x i64> @vwadd_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vsext.vf4 v11, v8 ; CHECK-NEXT: vwadd.vv v8, v10, v11 ; CHECK-NEXT: ret @@ -695,10 +695,10 @@ define <8 x i16> @vwadd_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwadd_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwadd.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -750,10 +750,10 @@ define <4 x i32> @vwadd_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwadd.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -824,11 +824,11 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: lw a0, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v8, (a0), zero @@ -838,10 +838,10 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwadd_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwadd.wv v8, v8, v9 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index 98f246b8741dc..b553019568b4f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -418,8 +418,8 @@ define <4 x i64> @vwaddu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vzext.vf4 v11, v8 ; CHECK-NEXT: vwaddu.vv v8, v10, v11 ; CHECK-NEXT: ret @@ -695,10 +695,10 @@ define <8 x i16> @vwaddu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwaddu_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lh 
a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwaddu.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -750,10 +750,10 @@ define <4 x i32> @vwaddu_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwaddu_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwaddu.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -866,11 +866,11 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: lw a0, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v8, (a0), zero @@ -880,10 +880,10 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwaddu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwaddu.wv v8, v8, v9 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index eb7be14abe431..115113045548b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -454,8 +454,8 @@ define <4 x i64> @vwmul_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vsext.vf4 v11, v8 ; CHECK-NEXT: vwmul.vv v8, v10, v11 ; CHECK-NEXT: ret @@ -859,11 +859,11 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: lw a0, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v9, (a0), zero diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index 8626b25a9d323..ce84e9fa0cbfd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -446,8 +446,8 @@ define <4 x i64> @vwmulsu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vsext.vf4 v11, v8 ; CHECK-NEXT: vwmulsu.vv v8, v11, v10 ; CHECK-NEXT: ret @@ -740,10 +740,10 @@ define <8 x i16> @vwmulsu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap: ; CHECK: # %bb.0: -; CHECK-NEXT: 
lb a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwmulsu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index 007b561a2247a..9adaefd37abab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -430,8 +430,8 @@ define <4 x i64> @vwmulu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vzext.vf4 v11, v8 ; CHECK-NEXT: vwmulu.vv v8, v10, v11 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll index 382f00913cb41..36af235446425 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll @@ -41,8 +41,8 @@ define <8 x i64> @vwsubu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 42 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vwsubu.vv v12, v10, v8 ; CHECK-NEXT: vmv4r.v v8, v12 @@ -60,8 +60,8 @@ define <8 x i64> @vwsub_wv_mask_v8i32_nonzero(<8 x i32> %x, <8 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 42 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 ; CHECK-NEXT: vwsub.wv v8, v12, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index 7a925165d9816..5d3e39f96d567 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -418,8 +418,8 @@ define <4 x i64> @vwsub_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vsext.vf4 v11, v8 ; CHECK-NEXT: vwsub.vv v8, v10, v11 ; CHECK-NEXT: ret @@ -677,10 +677,10 @@ define <16 x i64> @vwsub_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -696,10 +696,10 @@ define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwsub.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -715,10 
+715,10 @@ define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -734,10 +734,10 @@ define <4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -753,10 +753,10 @@ define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwsub.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -772,10 +772,10 @@ define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwsub_vx_v2i64_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x @@ -791,10 +791,10 @@ define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwsub_vx_v2i64_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x @@ -810,10 +810,10 @@ define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwsub_vx_v2i64_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x @@ -830,11 +830,11 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: lw a0, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v8, (a0), zero @@ -844,10 +844,10 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsub_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; 
RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwsub.wv v8, v8, v9 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index 4c08a8c15a388..bbe1ba03bdb6d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -418,8 +418,8 @@ define <4 x i64> @vwsubu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vzext.vf4 v11, v8 ; CHECK-NEXT: vwsubu.vv v8, v10, v11 ; CHECK-NEXT: ret @@ -677,10 +677,10 @@ define <16 x i64> @vwsubu_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lbu a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x @@ -696,10 +696,10 @@ define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwsubu.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -715,10 +715,10 @@ define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lbu a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -734,10 +734,10 @@ define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lhu a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: lhu a0, 0(a1) +; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x @@ -753,10 +753,10 @@ define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwsubu.wv v8, v8, v9 ; CHECK-NEXT: ret @@ -786,10 +786,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i8: ; RV64: # %bb.0: -; RV64-NEXT: lbu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: vmv.v.x 
v10, a0 ; RV64-NEXT: vwsubu.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x @@ -819,10 +819,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i16: ; RV64: # %bb.0: -; RV64-NEXT: lhu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: lhu a0, 0(a1) +; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: vwsubu.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x @@ -852,10 +852,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i32: ; RV64: # %bb.0: -; RV64-NEXT: lwu a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: lwu a0, 0(a1) +; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: vwsubu.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x @@ -872,11 +872,11 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: lw a2, 0(a1) -; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: lw a0, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v8, (a0), zero @@ -886,10 +886,10 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vmv.v.x v8, a0 ; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwsubu.wv v8, v8, v9 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll index df90dae379c06..b38701ebd3448 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zext-vp.ll @@ -151,9 +151,9 @@ declare <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32>, <32 x i1>, i32) define <32 x i64> @vzext_v32i64_v32i32(<32 x i32> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vzext_v32i64_v32i32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB12_2 ; CHECK-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll index b7661bd826fed..ad973b72b271f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll @@ -405,8 +405,8 @@ define @ceil_nxv1f32_to_si8( %x) { ; RV32-NEXT: vfabs.v v9, v8 ; RV32-NEXT: lui a0, 307200 ; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -424,8 +424,8 @@ define @ceil_nxv1f32_to_si8( %x) { ; RV64-NEXT: vfabs.v v9, v8 ; RV64-NEXT: lui a0, 307200 ; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -448,8 +448,8 @@ define 
@ceil_nxv1f32_to_ui8( %x) { ; RV32-NEXT: vfabs.v v9, v8 ; RV32-NEXT: lui a0, 307200 ; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v9, fa5 ; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -467,8 +467,8 @@ define @ceil_nxv1f32_to_ui8( %x) { ; RV64-NEXT: vfabs.v v9, v8 ; RV64-NEXT: lui a0, 307200 ; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -631,8 +631,8 @@ define @ceil_nxv4f32_to_si8( %x) { ; RV32-NEXT: vfabs.v v10, v8 ; RV32-NEXT: lui a0, 307200 ; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v10, fa5 ; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -650,8 +650,8 @@ define @ceil_nxv4f32_to_si8( %x) { ; RV64-NEXT: vfabs.v v10, v8 ; RV64-NEXT: lui a0, 307200 ; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v10, fa5 ; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -674,8 +674,8 @@ define @ceil_nxv4f32_to_ui8( %x) { ; RV32-NEXT: vfabs.v v10, v8 ; RV32-NEXT: lui a0, 307200 ; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 ; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vmflt.vf v0, v10, fa5 ; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t ; RV32-NEXT: fsrm a0 ; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -693,8 +693,8 @@ define @ceil_nxv4f32_to_ui8( %x) { ; RV64-NEXT: vfabs.v v10, v8 ; RV64-NEXT: lui a0, 307200 ; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 ; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vmflt.vf v0, v10, fa5 ; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t ; RV64-NEXT: fsrm a0 ; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index f9b5095c9af1d..c8b5487b3aee6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -22,12 +22,12 @@ define @vp_floor_nxv1bf16( %va, @vp_floor_nxv1bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -76,12 +76,12 @@ define @vp_floor_nxv2bf16( %va, @vp_floor_nxv2bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -130,12 +130,12 @@ define @vp_floor_nxv4bf16( %va, @vp_floor_nxv4bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: 
vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -184,12 +184,12 @@ define @vp_floor_nxv8bf16( %va, @vp_floor_nxv8bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -238,12 +238,12 @@ define @vp_floor_nxv16bf16( %va, @vp_floor_nxv16bf16_unmasked( @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16( %va, @vp_floor_nxv32bf16_unmasked( @vp_floor_nxv32bf16_unmasked( @vp_floor_nxv32bf16_unmasked( @llvm.vp.floor.nxv1f16(, @vp_floor_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a0) +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -461,12 +461,12 @@ define @vp_floor_nxv1f16( %va, @vp_floor_nxv1f16( %va, @vp_floor_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -503,11 +503,11 @@ define @vp_floor_nxv1f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -525,13 +525,13 @@ declare @llvm.vp.floor.nxv2f16(, @vp_floor_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a0) +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -547,12 +547,12 @@ define @vp_floor_nxv2f16( %va, @vp_floor_nxv2f16( %va, @vp_floor_nxv2f16_unmasked( 
%va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -589,11 +589,11 @@ define @vp_floor_nxv2f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -611,13 +611,13 @@ declare @llvm.vp.floor.nxv4f16(, @vp_floor_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a0) +; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -633,12 +633,12 @@ define @vp_floor_nxv4f16( %va, @vp_floor_nxv4f16( %va, @vp_floor_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -675,11 +675,11 @@ define @vp_floor_nxv4f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -699,12 +699,12 @@ define @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -763,11 +763,11 @@ define @vp_floor_nxv8f16_unmasked( %va, i ; ZVFHMIN-NEXT: 
vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -787,12 +787,12 @@ define @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -851,11 +851,11 @@ define @vp_floor_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -875,12 +875,12 @@ define @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; ZVFH-NEXT: fsrmi a0, 2 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -995,11 +995,12 @@ define @vp_floor_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 +; ZVFHMIN-NEXT: fsrmi a4, 2 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: vmv1r.v v17, v16 @@ -1014,11 +1015,10 @@ define @vp_floor_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a2, 2 ; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: fsrm a2 +; ZVFHMIN-NEXT: fsrm a4 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -1033,10 +1033,10 @@ define @vp_floor_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: 
vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 2 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -1064,9 +1064,9 @@ define @vp_floor_nxv1f32( %va, @vp_floor_nxv1f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1106,9 +1106,9 @@ define @vp_floor_nxv2f32( %va, @vp_floor_nxv2f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1149,9 +1149,9 @@ define @vp_floor_nxv4f32( %va, @vp_floor_nxv4f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1193,9 +1193,9 @@ define @vp_floor_nxv8f32( %va, @vp_floor_nxv8f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1237,9 +1237,9 @@ define @vp_floor_nxv16f32( %va, @vp_floor_nxv16f32_unmasked( % ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1276,13 +1276,13 @@ declare @llvm.vp.floor.nxv1f64(, @vp_floor_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a0) +; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -1297,12 +1297,12 @@ define @vp_floor_nxv1f64( %va, @vp_floor_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1320,12 +1320,12 @@ define @vp_floor_nxv2f64( 
%va, @vp_floor_nxv2f64( %va, @vp_floor_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1364,12 +1364,12 @@ define @vp_floor_nxv4f64( %va, @vp_floor_nxv4f64( %va, @vp_floor_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1408,12 +1408,12 @@ define @vp_floor_nxv7f64( %va, @vp_floor_nxv7f64( %va, @vp_floor_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1452,12 +1452,12 @@ define @vp_floor_nxv8f64( %va, @vp_floor_nxv8f64( %va, @vp_floor_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1498,59 +1498,66 @@ define @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64_unmasked( ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: fsrmi a3, 2 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 2 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: fsrm a3 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t @@ -1585,8 +1592,8 @@ define @vp_floor_nxv16f64_unmasked( ; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 2 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll index 
7fad68dbfbbda..42903f0d85e32 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll @@ -22,16 +22,14 @@ define @vfmax_nxv1bf16_vv( %a, @vfmax_nxv2bf16_vv( %a, @vfmax_nxv4bf16_vv( %a, @vfmax_nxv8bf16_vv( %a, @vfmax_nxv1f16_vv( %a, @vfmax_nxv2f16_vv( %a, @vfmax_nxv4f16_vv( %a, @vfmax_nxv8f16_vv( %a, @vfmax_nxv1f16_vv_nnana( %a, @vfmax_nxv1f16_vv_nnanb( %a, @vfmax_vv_nxv1bf16_unmasked( % ; CHECK-LABEL: vfmax_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 +; CHECK-NEXT: vfmax.vv v9, v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret @@ -87,16 +85,14 @@ define @vfmax_vv_nxv2bf16_unmasked( % ; CHECK-LABEL: vfmax_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 +; CHECK-NEXT: vfmax.vv v9, v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret @@ -134,15 +130,13 @@ define @vfmax_vv_nxv4bf16_unmasked( % ; CHECK-LABEL: vfmax_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0 +; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0 +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vfmax.vv v10, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 @@ -181,15 +175,13 @@ define @vfmax_vv_nxv8bf16_unmasked( % ; CHECK-LABEL: vfmax_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vsetvli zero, zero, 
e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0 +; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0 +; CHECK-NEXT: vmfeq.vv v0, v12, v12 +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vfmax.vv v12, v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -633,16 +625,14 @@ define @vfmax_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -701,16 +691,14 @@ define @vfmax_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -771,15 +759,13 @@ define @vfmax_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 @@ -843,15 +829,13 @@ define @vfmax_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: 
vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v12, v12, v16, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 @@ -1615,8 +1599,6 @@ define @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 @@ -1726,45 +1702,36 @@ define @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 -; CHECK-NEXT: vfmax.vv v8, v16, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfmax.vv v16, v16, v8 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: bltu a2, a1, .LBB41_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB41_2: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vfmax.vv v8, v8, v24 
-; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll index 8cae0bbc03c8e..3dc02bb4a5a11 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll @@ -22,16 +22,14 @@ define @vfmin_nxv1bf16_vv( %a, @vfmin_nxv2bf16_vv( %a, @vfmin_nxv4bf16_vv( %a, @vfmin_nxv8bf16_vv( %a, @vfmin_nxv1f16_vv( %a, @vfmin_nxv2f16_vv( %a, @vfmin_nxv4f16_vv( %a, @vfmin_nxv8f16_vv( %a, @vfmin_nxv1f16_vv_nnana( %a, @vfmin_nxv1f16_vv_nnanb( %a, @vfmin_vv_nxv1bf16_unmasked( % ; CHECK-LABEL: vfmin_vv_nxv1bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 +; CHECK-NEXT: vfmin.vv v9, v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret @@ -87,16 +85,14 @@ define @vfmin_vv_nxv2bf16_unmasked( % ; CHECK-LABEL: vfmin_vv_nxv2bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v9, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vmerge.vvm v9, v10, v8, v0 -; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 +; CHECK-NEXT: vfmin.vv v9, v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 ; CHECK-NEXT: ret @@ -134,15 +130,13 @@ define @vfmin_vv_nxv4bf16_unmasked( % ; CHECK-LABEL: vfmin_vv_nxv4bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v12, v10, v0 +; CHECK-NEXT: vmerge.vvm v8, v12, v10, v0 +; CHECK-NEXT: vmfeq.vv v0, v10, v10 +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vfmin.vv v10, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 
@@ -181,15 +175,13 @@ define @vfmin_vv_nxv8bf16_unmasked( % ; CHECK-LABEL: vfmin_vv_nxv8bf16_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v16, v12, v0 +; CHECK-NEXT: vmerge.vvm v8, v16, v12, v0 +; CHECK-NEXT: vmfeq.vv v0, v12, v12 +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vfmin.vv v12, v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 @@ -633,16 +625,14 @@ define @vfmin_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -701,16 +691,14 @@ define @vfmin_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 +; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v10, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v8, v0 -; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v10, v0 -; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v9, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret @@ -771,15 +759,13 @@ define @vfmin_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v9 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmerge.vvm v10, v12, v10, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; 
ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 @@ -843,15 +829,13 @@ define @vfmin_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v10 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmerge.vvm v12, v16, v12, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v12, v12, v16, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 @@ -1615,8 +1599,6 @@ define @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 @@ -1726,45 +1702,36 @@ define @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 -; CHECK-NEXT: vfmin.vv v8, v16, v8 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfmin.vv v16, v16, v8 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: bltu a2, a1, .LBB41_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB41_2: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmerge.vvm v8, v24, v8, 
v0 +; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: vfmin.vv v8, v8, v24 -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index 7a4695d1c25c1..3276c68b9b6ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -12,17 +12,17 @@ define @nearbyint_nxv1f16( %v) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -36,17 +36,17 @@ define @nearbyint_nxv2f16( %v) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -60,17 +60,17 @@ define @nearbyint_nxv4f16( %v) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -84,17 +84,17 @@ define 
@nearbyint_nxv8f16( %v) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -108,17 +108,17 @@ define @nearbyint_nxv16f16( %v) strictf ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv16f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -132,17 +132,17 @@ define @nearbyint_nxv32f16( %v) strictf ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv32f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -158,15 +158,15 @@ define @nearbyint_nxv1f32( %v) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -182,15 +182,15 @@ define @nearbyint_nxv2f32( %v) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, 
v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -206,15 +206,15 @@ define @nearbyint_nxv4f32( %v) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -230,15 +230,15 @@ define @nearbyint_nxv8f32( %v) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -254,15 +254,15 @@ define @nearbyint_nxv16f32( %v) stric ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv16f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -276,17 +276,17 @@ define @nearbyint_nxv1f64( %v) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -300,17 +300,17 @@ define @nearbyint_nxv2f64( %v) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, 
%lo(.LCPI12_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -324,17 +324,17 @@ define @nearbyint_nxv4f64( %v) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -348,17 +348,17 @@ define @nearbyint_nxv8f64( %v) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 807a3e460b153..78760234fa493 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -18,18 +18,18 @@ define @nearbyint_nxv1bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1bf16( %x) ret %a @@ -41,18 +41,18 @@ define @nearbyint_nxv2bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, 
ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2bf16( %x) ret %a @@ -64,18 +64,18 @@ define @nearbyint_nxv4bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4bf16( %x) ret %a @@ -87,18 +87,18 @@ define @nearbyint_nxv8bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8bf16( %x) ret %a @@ -110,18 +110,18 @@ define @nearbyint_nxv16bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv16bf16( %x) ret %a @@ -130,50 +130,35 @@ define @nearbyint_nxv16bf16( %x) { define @nearbyint_nxv32bf16( %x) { ; CHECK-LABEL: nearbyint_nxv32bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: 
vmflt.vf v0, v24, fa5 +; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: vfabs.v v8, v24 -; CHECK-NEXT: vmflt.vf v7, v8, fa5 -; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vmflt.vf v0, v8, fa5 ; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 -; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv32bf16( %x) ret %a @@ -182,17 +167,17 @@ define @nearbyint_nxv32bf16( %x) { define @nearbyint_nxv1f16( %x) { ; ZVFH-LABEL: nearbyint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: nearbyint_nxv1f16: @@ -200,18 +185,18 @@ define @nearbyint_nxv1f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv1f16( %x) ret %a @@ -221,17 +206,17 @@ declare @llvm.nearbyint.nxv1f16() define @nearbyint_nxv2f16( %x) { ; ZVFH-LABEL: nearbyint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v 
v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: nearbyint_nxv2f16: @@ -239,18 +224,18 @@ define @nearbyint_nxv2f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv2f16( %x) ret %a @@ -260,17 +245,17 @@ declare @llvm.nearbyint.nxv2f16() define @nearbyint_nxv4f16( %x) { ; ZVFH-LABEL: nearbyint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: nearbyint_nxv4f16: @@ -278,18 +263,18 @@ define @nearbyint_nxv4f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv4f16( %x) ret %a @@ -299,17 +284,17 @@ declare @llvm.nearbyint.nxv4f16() define @nearbyint_nxv8f16( %x) { ; ZVFH-LABEL: nearbyint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: nearbyint_nxv8f16: @@ -317,18 +302,18 @@ 
define @nearbyint_nxv8f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv8f16( %x) ret %a @@ -338,17 +323,17 @@ declare @llvm.nearbyint.nxv8f16() define @nearbyint_nxv16f16( %x) { ; ZVFH-LABEL: nearbyint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: nearbyint_nxv16f16: @@ -356,18 +341,18 @@ define @nearbyint_nxv16f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv16f16( %x) ret %a @@ -377,65 +362,50 @@ declare @llvm.nearbyint.nxv16f16() define @nearbyint_nxv32f16( %x) { ; ZVFH-LABEL: nearbyint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: nearbyint_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: 
fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: frflags a0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfabs.v v24, v16 +; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: vfabs.v v8, v24 -; ZVFHMIN-NEXT: vmflt.vf v7, v8, fa5 -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t -; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 ; ZVFHMIN-NEXT: fsflags a0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu -; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: frflags a0 -; ZVFHMIN-NEXT: vmv1r.v v0, v7 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 -; ZVFHMIN-NEXT: fsflags a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv32f16( %x) ret %a @@ -449,13 +419,13 @@ define @nearbyint_nxv1f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f32( %x) ret %a @@ -469,13 +439,13 @@ define @nearbyint_nxv2f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f32( %x) ret %a @@ -489,13 +459,13 @@ define @nearbyint_nxv4f32( %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f32( %x) ret %a @@ -509,13 +479,13 @@ 
define @nearbyint_nxv8f32( %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f32( %x) ret %a @@ -529,13 +499,13 @@ define @nearbyint_nxv16f32( %x) { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv16f32( %x) ret %a @@ -545,17 +515,17 @@ declare @llvm.nearbyint.nxv16f32() define @nearbyint_nxv1f64( %x) { ; CHECK-LABEL: nearbyint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f64( %x) ret %a @@ -565,17 +535,17 @@ declare @llvm.nearbyint.nxv1f64() define @nearbyint_nxv2f64( %x) { ; CHECK-LABEL: nearbyint_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f64( %x) ret %a @@ -585,17 +555,17 @@ declare @llvm.nearbyint.nxv2f64() define @nearbyint_nxv4f64( %x) { ; CHECK-LABEL: nearbyint_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f64( %x) ret %a @@ -605,17 +575,17 @@ declare @llvm.nearbyint.nxv4f64() define @nearbyint_nxv8f64( %x) { ; CHECK-LABEL: nearbyint_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, 
%lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f64( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll index 5e657a93ec0d6..a420e9ecee563 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll @@ -7,10 +7,10 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; RV32-LABEL: test: ; RV32: # %bb.0: # %entry -; RV32-NEXT: addi a3, a2, 1 -; RV32-NEXT: th.lbib a4, (a1), -1, 0 +; RV32-NEXT: th.lbib a3, (a1), -1, 0 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a3, a2, 1 ; RV32-NEXT: vmv.s.x v9, zero ; RV32-NEXT: vsetvli zero, a3, e8, mf2, tu, ma ; RV32-NEXT: vslideup.vx v8, v9, a2 @@ -33,10 +33,10 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) { ; ; RV64-LABEL: test: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addi a3, a2, 1 -; RV64-NEXT: th.lbib a4, (a1), -1, 0 +; RV64-NEXT: th.lbib a3, (a1), -1, 0 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.x v8, a4 +; RV64-NEXT: vmv.v.x v8, a3 +; RV64-NEXT: addi a3, a2, 1 ; RV64-NEXT: vmv.s.x v9, zero ; RV64-NEXT: vsetvli zero, a3, e8, mf2, tu, ma ; RV64-NEXT: vslideup.vx v8, v9, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll b/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll index e24b23c9b2d32..7504c570e6c7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-vector-cmp.ll @@ -15,11 +15,11 @@ define i32 @test(i32 %call.i) { ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.v.x v8, a0 ; CHECK-V-NEXT: lui a0, 524288 +; CHECK-V-NEXT: vmv.v.i v9, 0 ; CHECK-V-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-V-NEXT: addi a0, a0, 2 ; CHECK-V-NEXT: vmslt.vx v0, v8, a0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-V-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-V-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-V-NEXT: vmv.x.s a0, v8 ; CHECK-V-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index f6598606b09f1..052a10e0adcdc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -9,10 +9,10 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-NOV-LABEL: stest_f64i32: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz ; CHECK-NOV-NEXT: lui a2, 524288 ; CHECK-NOV-NEXT: addiw a3, a2, -1 -; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: bge a1, a3, .LBB0_5 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: bge a0, a3, .LBB0_6 @@ -55,10 +55,10 @@ entry: define <2 x i32> @utest_f64i32(<2 x double> %x) { ; CHECK-NOV-LABEL: utest_f64i32: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.lu.d a1, fa1, rtz ; CHECK-NOV-NEXT: fcvt.lu.d a0, fa0, rtz ; CHECK-NOV-NEXT: li a2, -1 ; CHECK-NOV-NEXT: srli a2, a2, 32 -; 
CHECK-NOV-NEXT: fcvt.lu.d a1, fa1, rtz ; CHECK-NOV-NEXT: bgeu a0, a2, .LBB1_3 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: bgeu a1, a2, .LBB1_4 @@ -89,10 +89,10 @@ entry: define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-NOV-LABEL: ustest_f64i32: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz ; CHECK-NOV-NEXT: li a2, -1 ; CHECK-NOV-NEXT: srli a2, a2, 32 -; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: blt a1, a2, .LBB2_2 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: mv a1, a2 @@ -130,14 +130,14 @@ entry: define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NOV-LABEL: stest_f32i32: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fa2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fa3, rtz ; CHECK-NOV-NEXT: lui a3, 524288 ; CHECK-NOV-NEXT: addiw a6, a3, -1 -; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a6, .LBB3_10 +; CHECK-NOV-NEXT: bge a2, a6, .LBB3_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a6, .LBB3_11 +; CHECK-NOV-NEXT: bge a1, a6, .LBB3_11 ; CHECK-NOV-NEXT: .LBB3_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz ; CHECK-NOV-NEXT: bge a4, a6, .LBB3_12 @@ -148,23 +148,23 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB3_5: # %entry ; CHECK-NOV-NEXT: bge a3, a4, .LBB3_15 ; CHECK-NOV-NEXT: .LBB3_6: # %entry -; CHECK-NOV-NEXT: bge a3, a2, .LBB3_16 +; CHECK-NOV-NEXT: bge a3, a1, .LBB3_16 ; CHECK-NOV-NEXT: .LBB3_7: # %entry -; CHECK-NOV-NEXT: blt a3, a1, .LBB3_9 +; CHECK-NOV-NEXT: blt a3, a2, .LBB3_9 ; CHECK-NOV-NEXT: .LBB3_8: # %entry -; CHECK-NOV-NEXT: lui a1, 524288 +; CHECK-NOV-NEXT: lui a2, 524288 ; CHECK-NOV-NEXT: .LBB3_9: # %entry ; CHECK-NOV-NEXT: sw a5, 0(a0) ; CHECK-NOV-NEXT: sw a4, 4(a0) -; CHECK-NOV-NEXT: sw a2, 8(a0) -; CHECK-NOV-NEXT: sw a1, 12(a0) +; CHECK-NOV-NEXT: sw a1, 8(a0) +; CHECK-NOV-NEXT: sw a2, 12(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB3_10: # %entry -; CHECK-NOV-NEXT: mv a1, a6 +; CHECK-NOV-NEXT: mv a2, a6 ; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a6, .LBB3_2 +; CHECK-NOV-NEXT: blt a1, a6, .LBB3_2 ; CHECK-NOV-NEXT: .LBB3_11: # %entry -; CHECK-NOV-NEXT: mv a2, a6 +; CHECK-NOV-NEXT: mv a1, a6 ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz ; CHECK-NOV-NEXT: blt a4, a6, .LBB3_3 ; CHECK-NOV-NEXT: .LBB3_12: # %entry @@ -178,10 +178,10 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-NOV-NEXT: blt a3, a4, .LBB3_6 ; CHECK-NOV-NEXT: .LBB3_15: # %entry ; CHECK-NOV-NEXT: lui a4, 524288 -; CHECK-NOV-NEXT: blt a3, a2, .LBB3_7 +; CHECK-NOV-NEXT: blt a3, a1, .LBB3_7 ; CHECK-NOV-NEXT: .LBB3_16: # %entry -; CHECK-NOV-NEXT: lui a2, 524288 -; CHECK-NOV-NEXT: bge a3, a1, .LBB3_8 +; CHECK-NOV-NEXT: lui a1, 524288 +; CHECK-NOV-NEXT: bge a3, a2, .LBB3_8 ; CHECK-NOV-NEXT: j .LBB3_9 ; ; CHECK-V-LABEL: stest_f32i32: @@ -203,14 +203,14 @@ entry: define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-NOV-LABEL: utest_f32i32: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a1, fa0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a1, fa1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a2, fa0, rtz ; CHECK-NOV-NEXT: li a3, -1 ; CHECK-NOV-NEXT: srli a3, a3, 32 -; CHECK-NOV-NEXT: fcvt.lu.s a2, fa1, rtz -; CHECK-NOV-NEXT: bgeu a1, a3, .LBB4_6 +; CHECK-NOV-NEXT: bgeu a2, a3, .LBB4_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bgeu a2, a3, .LBB4_7 +; CHECK-NOV-NEXT: bgeu 
a1, a3, .LBB4_7 ; CHECK-NOV-NEXT: .LBB4_2: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bgeu a4, a3, .LBB4_8 @@ -219,17 +219,17 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB4_4: # %entry ; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB4_5: # %entry -; CHECK-NOV-NEXT: sw a1, 0(a0) -; CHECK-NOV-NEXT: sw a2, 4(a0) +; CHECK-NOV-NEXT: sw a2, 0(a0) +; CHECK-NOV-NEXT: sw a1, 4(a0) ; CHECK-NOV-NEXT: sw a4, 8(a0) ; CHECK-NOV-NEXT: sw a5, 12(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB4_6: # %entry -; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bltu a2, a3, .LBB4_2 +; CHECK-NOV-NEXT: bltu a1, a3, .LBB4_2 ; CHECK-NOV-NEXT: .LBB4_7: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a1, a3 ; CHECK-NOV-NEXT: fcvt.lu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bltu a4, a3, .LBB4_3 ; CHECK-NOV-NEXT: .LBB4_8: # %entry @@ -254,10 +254,10 @@ entry: define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i32: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz ; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz ; CHECK-NOV-NEXT: li a4, -1 ; CHECK-NOV-NEXT: srli a4, a4, 32 -; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz ; CHECK-NOV-NEXT: bge a1, a4, .LBB5_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz @@ -341,12 +341,12 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 0(a1) ; CHECK-NOV-NEXT: lhu s2, 8(a1) -; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu a0, 16(a1) ; CHECK-NOV-NEXT: lhu s3, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -355,8 +355,8 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a1, 524288 @@ -454,11 +454,11 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -473,11 +473,11 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -541,22 +541,22 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; 
CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state -; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu a2, 8(a1) -; CHECK-NOV-NEXT: lhu s2, 16(a1) -; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: lhu s2, 0(a1) +; CHECK-NOV-NEXT: lhu a0, 8(a1) +; CHECK-NOV-NEXT: lhu s1, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs2, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-NOV-NEXT: li a1, -1 @@ -634,11 +634,11 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -653,11 +653,11 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -719,12 +719,12 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 0(a1) ; CHECK-NOV-NEXT: lhu s2, 8(a1) -; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu a0, 16(a1) ; CHECK-NOV-NEXT: lhu s3, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -733,8 +733,8 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: li a2, -1 @@ -824,11 +824,11 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, 
ta, ma @@ -843,11 +843,11 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -895,10 +895,10 @@ entry: define <2 x i16> @stest_f64i16(<2 x double> %x) { ; CHECK-NOV-LABEL: stest_f64i16: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz ; CHECK-NOV-NEXT: lui a2, 8 ; CHECK-NOV-NEXT: addiw a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: bge a1, a2, .LBB9_5 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: bge a0, a2, .LBB9_6 @@ -943,10 +943,10 @@ entry: define <2 x i16> @utest_f64i16(<2 x double> %x) { ; CHECK-NOV-LABEL: utest_f64i16: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.wu.d a1, fa1, rtz ; CHECK-NOV-NEXT: fcvt.wu.d a0, fa0, rtz ; CHECK-NOV-NEXT: lui a2, 16 ; CHECK-NOV-NEXT: addiw a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.wu.d a1, fa1, rtz ; CHECK-NOV-NEXT: bgeu a0, a2, .LBB10_3 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: bgeu a1, a2, .LBB10_4 @@ -977,10 +977,10 @@ entry: define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-NOV-LABEL: ustest_f64i16: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz ; CHECK-NOV-NEXT: lui a2, 16 ; CHECK-NOV-NEXT: addiw a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: blt a1, a2, .LBB11_2 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: mv a1, a2 @@ -1018,14 +1018,14 @@ entry: define <4 x i16> @stest_f32i16(<4 x float> %x) { ; CHECK-NOV-LABEL: stest_f32i16: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz +; CHECK-NOV-NEXT: fcvt.w.s a1, fa2, rtz +; CHECK-NOV-NEXT: fcvt.w.s a2, fa3, rtz ; CHECK-NOV-NEXT: lui a5, 8 ; CHECK-NOV-NEXT: addiw a5, a5, -1 -; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a5, .LBB12_10 +; CHECK-NOV-NEXT: bge a2, a5, .LBB12_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a5, .LBB12_11 +; CHECK-NOV-NEXT: bge a1, a5, .LBB12_11 ; CHECK-NOV-NEXT: .LBB12_2: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a4, fa0, rtz ; CHECK-NOV-NEXT: bge a3, a5, .LBB12_12 @@ -1037,23 +1037,23 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB12_5: # %entry ; CHECK-NOV-NEXT: bge a5, a3, .LBB12_15 ; CHECK-NOV-NEXT: .LBB12_6: # %entry -; CHECK-NOV-NEXT: bge a5, a2, .LBB12_16 +; CHECK-NOV-NEXT: bge a5, a1, .LBB12_16 ; CHECK-NOV-NEXT: .LBB12_7: # %entry -; CHECK-NOV-NEXT: blt a5, a1, .LBB12_9 +; CHECK-NOV-NEXT: blt a5, a2, .LBB12_9 ; CHECK-NOV-NEXT: .LBB12_8: # %entry -; CHECK-NOV-NEXT: lui a1, 1048568 +; CHECK-NOV-NEXT: lui a2, 1048568 ; CHECK-NOV-NEXT: .LBB12_9: # %entry ; CHECK-NOV-NEXT: sh a4, 0(a0) ; CHECK-NOV-NEXT: sh a3, 2(a0) -; CHECK-NOV-NEXT: sh a2, 4(a0) -; CHECK-NOV-NEXT: sh a1, 6(a0) +; CHECK-NOV-NEXT: sh a1, 4(a0) +; CHECK-NOV-NEXT: sh a2, 6(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB12_10: # %entry -; CHECK-NOV-NEXT: mv a1, a5 +; CHECK-NOV-NEXT: mv a2, a5 ; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a5, .LBB12_2 +; CHECK-NOV-NEXT: blt a1, a5, .LBB12_2 ; 
CHECK-NOV-NEXT: .LBB12_11: # %entry -; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: mv a1, a5 ; CHECK-NOV-NEXT: fcvt.w.s a4, fa0, rtz ; CHECK-NOV-NEXT: blt a3, a5, .LBB12_3 ; CHECK-NOV-NEXT: .LBB12_12: # %entry @@ -1068,10 +1068,10 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) { ; CHECK-NOV-NEXT: blt a5, a3, .LBB12_6 ; CHECK-NOV-NEXT: .LBB12_15: # %entry ; CHECK-NOV-NEXT: lui a3, 1048568 -; CHECK-NOV-NEXT: blt a5, a2, .LBB12_7 +; CHECK-NOV-NEXT: blt a5, a1, .LBB12_7 ; CHECK-NOV-NEXT: .LBB12_16: # %entry -; CHECK-NOV-NEXT: lui a2, 1048568 -; CHECK-NOV-NEXT: bge a5, a1, .LBB12_8 +; CHECK-NOV-NEXT: lui a1, 1048568 +; CHECK-NOV-NEXT: bge a5, a2, .LBB12_8 ; CHECK-NOV-NEXT: j .LBB12_9 ; ; CHECK-V-LABEL: stest_f32i16: @@ -1094,14 +1094,14 @@ entry: define <4 x i16> @utest_f32i16(<4 x float> %x) { ; CHECK-NOV-LABEL: utest_f32i16: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.wu.s a1, fa0, rtz +; CHECK-NOV-NEXT: fcvt.wu.s a1, fa1, rtz +; CHECK-NOV-NEXT: fcvt.wu.s a2, fa0, rtz ; CHECK-NOV-NEXT: lui a3, 16 ; CHECK-NOV-NEXT: addiw a3, a3, -1 -; CHECK-NOV-NEXT: fcvt.wu.s a2, fa1, rtz -; CHECK-NOV-NEXT: bgeu a1, a3, .LBB13_6 +; CHECK-NOV-NEXT: bgeu a2, a3, .LBB13_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bgeu a2, a3, .LBB13_7 +; CHECK-NOV-NEXT: bgeu a1, a3, .LBB13_7 ; CHECK-NOV-NEXT: .LBB13_2: # %entry ; CHECK-NOV-NEXT: fcvt.wu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bgeu a4, a3, .LBB13_8 @@ -1110,17 +1110,17 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB13_4: # %entry ; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB13_5: # %entry -; CHECK-NOV-NEXT: sh a1, 0(a0) -; CHECK-NOV-NEXT: sh a2, 2(a0) +; CHECK-NOV-NEXT: sh a2, 0(a0) +; CHECK-NOV-NEXT: sh a1, 2(a0) ; CHECK-NOV-NEXT: sh a4, 4(a0) ; CHECK-NOV-NEXT: sh a5, 6(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB13_6: # %entry -; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bltu a2, a3, .LBB13_2 +; CHECK-NOV-NEXT: bltu a1, a3, .LBB13_2 ; CHECK-NOV-NEXT: .LBB13_7: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a1, a3 ; CHECK-NOV-NEXT: fcvt.wu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bltu a4, a3, .LBB13_3 ; CHECK-NOV-NEXT: .LBB13_8: # %entry @@ -1146,10 +1146,10 @@ entry: define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i16: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz ; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz ; CHECK-NOV-NEXT: lui a4, 16 ; CHECK-NOV-NEXT: addiw a4, a4, -1 -; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz ; CHECK-NOV-NEXT: bge a1, a4, .LBB14_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz @@ -1248,16 +1248,16 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) -; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu a0, 48(a1) ; CHECK-NOV-NEXT: lhu s3, 56(a1) ; CHECK-NOV-NEXT: lhu s4, 0(a1) ; CHECK-NOV-NEXT: lhu s5, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -1278,8 +1278,8 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; 
CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a7, 8 @@ -1458,7 +1458,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -1466,6 +1465,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1483,7 +1483,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -1491,6 +1490,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1515,11 +1515,11 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1534,11 +1534,11 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1632,16 +1632,16 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) ; CHECK-NOV-NEXT: lhu s3, 48(a1) ; CHECK-NOV-NEXT: lhu s4, 56(a1) ; CHECK-NOV-NEXT: lhu s5, 0(a1) -; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu a0, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 @@ -1662,8 +1662,8 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call 
__extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs6, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a3, 16 @@ -1800,7 +1800,6 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -1808,6 +1807,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1825,7 +1825,6 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -1833,6 +1832,7 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1857,11 +1857,11 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1876,11 +1876,11 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -1972,16 +1972,16 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) -; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu a0, 48(a1) ; CHECK-NOV-NEXT: lhu s3, 56(a1) ; CHECK-NOV-NEXT: lhu s4, 0(a1) ; CHECK-NOV-NEXT: lhu s5, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -2002,8 +2002,8 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call 
__extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a4, 16 @@ -2164,7 +2164,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -2172,6 +2171,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -2189,7 +2189,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -2197,6 +2196,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -2221,11 +2221,11 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -2240,11 +2240,11 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -3576,10 +3576,10 @@ entry: define <2 x i32> @stest_f64i32_mm(<2 x double> %x) { ; CHECK-NOV-LABEL: stest_f64i32_mm: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz ; CHECK-NOV-NEXT: lui a2, 524288 ; CHECK-NOV-NEXT: addiw a3, a2, -1 -; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: bge a1, a3, .LBB27_5 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: bge a0, a3, .LBB27_6 @@ -3620,10 +3620,10 @@ entry: define <2 x i32> @utest_f64i32_mm(<2 x double> %x) { ; CHECK-NOV-LABEL: utest_f64i32_mm: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.lu.d a1, fa1, rtz ; CHECK-NOV-NEXT: fcvt.lu.d a0, fa0, rtz ; CHECK-NOV-NEXT: li a2, -1 ; CHECK-NOV-NEXT: srli a2, a2, 32 -; CHECK-NOV-NEXT: fcvt.lu.d a1, fa1, rtz ; CHECK-NOV-NEXT: bgeu a0, a2, .LBB28_3 ; CHECK-NOV-NEXT: # %bb.1: # 
%entry ; CHECK-NOV-NEXT: bgeu a1, a2, .LBB28_4 @@ -3653,10 +3653,10 @@ entry: define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-NOV-LABEL: ustest_f64i32_mm: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.l.d a1, fa1, rtz ; CHECK-NOV-NEXT: li a2, -1 ; CHECK-NOV-NEXT: srli a2, a2, 32 -; CHECK-NOV-NEXT: fcvt.l.d a0, fa0, rtz ; CHECK-NOV-NEXT: blt a1, a2, .LBB29_2 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: mv a1, a2 @@ -3692,14 +3692,14 @@ entry: define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: stest_f32i32_mm: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz +; CHECK-NOV-NEXT: fcvt.l.s a1, fa2, rtz +; CHECK-NOV-NEXT: fcvt.l.s a2, fa3, rtz ; CHECK-NOV-NEXT: lui a3, 524288 ; CHECK-NOV-NEXT: addiw a6, a3, -1 -; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a6, .LBB30_10 +; CHECK-NOV-NEXT: bge a2, a6, .LBB30_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a6, .LBB30_11 +; CHECK-NOV-NEXT: bge a1, a6, .LBB30_11 ; CHECK-NOV-NEXT: .LBB30_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz ; CHECK-NOV-NEXT: bge a4, a6, .LBB30_12 @@ -3710,23 +3710,23 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB30_5: # %entry ; CHECK-NOV-NEXT: bge a3, a4, .LBB30_15 ; CHECK-NOV-NEXT: .LBB30_6: # %entry -; CHECK-NOV-NEXT: bge a3, a2, .LBB30_16 +; CHECK-NOV-NEXT: bge a3, a1, .LBB30_16 ; CHECK-NOV-NEXT: .LBB30_7: # %entry -; CHECK-NOV-NEXT: blt a3, a1, .LBB30_9 +; CHECK-NOV-NEXT: blt a3, a2, .LBB30_9 ; CHECK-NOV-NEXT: .LBB30_8: # %entry -; CHECK-NOV-NEXT: lui a1, 524288 +; CHECK-NOV-NEXT: lui a2, 524288 ; CHECK-NOV-NEXT: .LBB30_9: # %entry ; CHECK-NOV-NEXT: sw a5, 0(a0) ; CHECK-NOV-NEXT: sw a4, 4(a0) -; CHECK-NOV-NEXT: sw a2, 8(a0) -; CHECK-NOV-NEXT: sw a1, 12(a0) +; CHECK-NOV-NEXT: sw a1, 8(a0) +; CHECK-NOV-NEXT: sw a2, 12(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB30_10: # %entry -; CHECK-NOV-NEXT: mv a1, a6 +; CHECK-NOV-NEXT: mv a2, a6 ; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a6, .LBB30_2 +; CHECK-NOV-NEXT: blt a1, a6, .LBB30_2 ; CHECK-NOV-NEXT: .LBB30_11: # %entry -; CHECK-NOV-NEXT: mv a2, a6 +; CHECK-NOV-NEXT: mv a1, a6 ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz ; CHECK-NOV-NEXT: blt a4, a6, .LBB30_3 ; CHECK-NOV-NEXT: .LBB30_12: # %entry @@ -3740,10 +3740,10 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: blt a3, a4, .LBB30_6 ; CHECK-NOV-NEXT: .LBB30_15: # %entry ; CHECK-NOV-NEXT: lui a4, 524288 -; CHECK-NOV-NEXT: blt a3, a2, .LBB30_7 +; CHECK-NOV-NEXT: blt a3, a1, .LBB30_7 ; CHECK-NOV-NEXT: .LBB30_16: # %entry -; CHECK-NOV-NEXT: lui a2, 524288 -; CHECK-NOV-NEXT: bge a3, a1, .LBB30_8 +; CHECK-NOV-NEXT: lui a1, 524288 +; CHECK-NOV-NEXT: bge a3, a2, .LBB30_8 ; CHECK-NOV-NEXT: j .LBB30_9 ; ; CHECK-V-LABEL: stest_f32i32_mm: @@ -3763,14 +3763,14 @@ entry: define <4 x i32> @utest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: utest_f32i32_mm: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a1, fa0, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a1, fa1, rtz +; CHECK-NOV-NEXT: fcvt.lu.s a2, fa0, rtz ; CHECK-NOV-NEXT: li a3, -1 ; CHECK-NOV-NEXT: srli a3, a3, 32 -; CHECK-NOV-NEXT: fcvt.lu.s a2, fa1, rtz -; CHECK-NOV-NEXT: bgeu a1, a3, .LBB31_6 +; CHECK-NOV-NEXT: bgeu a2, a3, .LBB31_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bgeu a2, a3, .LBB31_7 +; CHECK-NOV-NEXT: bgeu a1, a3, .LBB31_7 ; 
CHECK-NOV-NEXT: .LBB31_2: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bgeu a4, a3, .LBB31_8 @@ -3779,17 +3779,17 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB31_4: # %entry ; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB31_5: # %entry -; CHECK-NOV-NEXT: sw a1, 0(a0) -; CHECK-NOV-NEXT: sw a2, 4(a0) +; CHECK-NOV-NEXT: sw a2, 0(a0) +; CHECK-NOV-NEXT: sw a1, 4(a0) ; CHECK-NOV-NEXT: sw a4, 8(a0) ; CHECK-NOV-NEXT: sw a5, 12(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB31_6: # %entry -; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bltu a2, a3, .LBB31_2 +; CHECK-NOV-NEXT: bltu a1, a3, .LBB31_2 ; CHECK-NOV-NEXT: .LBB31_7: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a1, a3 ; CHECK-NOV-NEXT: fcvt.lu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bltu a4, a3, .LBB31_3 ; CHECK-NOV-NEXT: .LBB31_8: # %entry @@ -3813,50 +3813,50 @@ entry: define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i32_mm: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NOV-NEXT: li a3, -1 -; CHECK-NOV-NEXT: srli a3, a3, 32 ; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a3, .LBB32_6 +; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz +; CHECK-NOV-NEXT: li a4, -1 +; CHECK-NOV-NEXT: srli a4, a4, 32 +; CHECK-NOV-NEXT: bge a1, a4, .LBB32_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a3, .LBB32_7 +; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz +; CHECK-NOV-NEXT: bge a2, a4, .LBB32_7 ; CHECK-NOV-NEXT: .LBB32_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a4, a3, .LBB32_8 +; CHECK-NOV-NEXT: bge a3, a4, .LBB32_8 ; CHECK-NOV-NEXT: .LBB32_3: # %entry -; CHECK-NOV-NEXT: blt a5, a3, .LBB32_5 +; CHECK-NOV-NEXT: blt a5, a4, .LBB32_5 ; CHECK-NOV-NEXT: .LBB32_4: # %entry -; CHECK-NOV-NEXT: mv a5, a3 +; CHECK-NOV-NEXT: mv a5, a4 ; CHECK-NOV-NEXT: .LBB32_5: # %entry -; CHECK-NOV-NEXT: sgtz a3, a5 -; CHECK-NOV-NEXT: negw a3, a3 -; CHECK-NOV-NEXT: and a3, a3, a5 -; CHECK-NOV-NEXT: sgtz a5, a4 +; CHECK-NOV-NEXT: sgtz a4, a5 +; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: and a4, a4, a5 +; CHECK-NOV-NEXT: sgtz a5, a3 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a4, a5, a4 +; CHECK-NOV-NEXT: and a3, a5, a3 ; CHECK-NOV-NEXT: sgtz a5, a2 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 ; CHECK-NOV-NEXT: sgtz a5, a1 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a1, a5, a1 -; CHECK-NOV-NEXT: sw a3, 0(a0) -; CHECK-NOV-NEXT: sw a4, 4(a0) +; CHECK-NOV-NEXT: sw a4, 0(a0) +; CHECK-NOV-NEXT: sw a3, 4(a0) ; CHECK-NOV-NEXT: sw a2, 8(a0) ; CHECK-NOV-NEXT: sw a1, 12(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB32_6: # %entry -; CHECK-NOV-NEXT: mv a1, a3 -; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a3, .LBB32_2 +; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz +; CHECK-NOV-NEXT: blt a2, a4, .LBB32_2 ; CHECK-NOV-NEXT: .LBB32_7: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a2, a4 ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a4, a3, .LBB32_3 +; CHECK-NOV-NEXT: blt a3, a4, .LBB32_3 ; CHECK-NOV-NEXT: .LBB32_8: # %entry -; CHECK-NOV-NEXT: mv a4, a3 -; CHECK-NOV-NEXT: bge a5, a3, .LBB32_4 +; CHECK-NOV-NEXT: mv a3, a4 +; CHECK-NOV-NEXT: bge a5, a4, .LBB32_4 ; CHECK-NOV-NEXT: j .LBB32_5 ; ; CHECK-V-LABEL: ustest_f32i32_mm: @@ -3898,12 +3898,12 @@ define <4 x i32> @stest_f16i32_mm(<4 x 
half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 0(a1) ; CHECK-NOV-NEXT: lhu s2, 8(a1) -; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu a0, 16(a1) ; CHECK-NOV-NEXT: lhu s3, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -3912,8 +3912,8 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a1, 524288 @@ -4011,11 +4011,11 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -4030,11 +4030,11 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -4096,22 +4096,22 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state -; CHECK-NOV-NEXT: lhu s1, 0(a1) -; CHECK-NOV-NEXT: lhu a2, 8(a1) -; CHECK-NOV-NEXT: lhu s2, 16(a1) -; CHECK-NOV-NEXT: lhu s3, 24(a1) ; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: lhu s2, 0(a1) +; CHECK-NOV-NEXT: lhu a0, 8(a1) +; CHECK-NOV-NEXT: lhu s1, 16(a1) +; CHECK-NOV-NEXT: lhu s3, 24(a1) +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s2 +; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs1, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs2, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-NOV-NEXT: li a1, -1 @@ -4189,11 +4189,11 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; 
CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -4208,11 +4208,11 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -4273,12 +4273,12 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs1, -56 ; CHECK-NOV-NEXT: .cfi_offset fs2, -64 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 0(a1) ; CHECK-NOV-NEXT: lhu s2, 8(a1) -; CHECK-NOV-NEXT: lhu a2, 16(a1) +; CHECK-NOV-NEXT: lhu a0, 16(a1) ; CHECK-NOV-NEXT: lhu s3, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs2, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -4287,8 +4287,8 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s1 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs2, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: li a2, -1 @@ -4378,11 +4378,11 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -4397,11 +4397,11 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma @@ -4447,10 +4447,10 @@ entry: define <2 x i16> @stest_f64i16_mm(<2 x double> %x) { ; CHECK-NOV-LABEL: stest_f64i16_mm: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz ; CHECK-NOV-NEXT: lui a2, 8 ; CHECK-NOV-NEXT: addiw a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: bge a1, a2, .LBB36_5 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: bge a0, a2, .LBB36_6 @@ -4493,10 +4493,10 @@ entry: define <2 x i16> @utest_f64i16_mm(<2 x double> %x) { ; CHECK-NOV-LABEL: utest_f64i16_mm: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.wu.d a1, fa1, rtz ; CHECK-NOV-NEXT: fcvt.wu.d a0, fa0, rtz ; CHECK-NOV-NEXT: lui a2, 16 ; CHECK-NOV-NEXT: addiw a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.wu.d a1, fa1, rtz ; CHECK-NOV-NEXT: bgeu a0, a2, .LBB37_3 ; CHECK-NOV-NEXT: # 
%bb.1: # %entry ; CHECK-NOV-NEXT: bgeu a1, a2, .LBB37_4 @@ -4526,10 +4526,10 @@ entry: define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK-NOV-LABEL: ustest_f64i16_mm: ; CHECK-NOV: # %bb.0: # %entry +; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: fcvt.w.d a1, fa1, rtz ; CHECK-NOV-NEXT: lui a2, 16 ; CHECK-NOV-NEXT: addiw a2, a2, -1 -; CHECK-NOV-NEXT: fcvt.w.d a0, fa0, rtz ; CHECK-NOV-NEXT: blt a1, a2, .LBB38_2 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: mv a1, a2 @@ -4565,14 +4565,14 @@ entry: define <4 x i16> @stest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: stest_f32i16_mm: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz +; CHECK-NOV-NEXT: fcvt.w.s a1, fa2, rtz +; CHECK-NOV-NEXT: fcvt.w.s a2, fa3, rtz ; CHECK-NOV-NEXT: lui a5, 8 ; CHECK-NOV-NEXT: addiw a5, a5, -1 -; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a5, .LBB39_10 +; CHECK-NOV-NEXT: bge a2, a5, .LBB39_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a5, .LBB39_11 +; CHECK-NOV-NEXT: bge a1, a5, .LBB39_11 ; CHECK-NOV-NEXT: .LBB39_2: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a4, fa0, rtz ; CHECK-NOV-NEXT: bge a3, a5, .LBB39_12 @@ -4584,23 +4584,23 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB39_5: # %entry ; CHECK-NOV-NEXT: bge a5, a3, .LBB39_15 ; CHECK-NOV-NEXT: .LBB39_6: # %entry -; CHECK-NOV-NEXT: bge a5, a2, .LBB39_16 +; CHECK-NOV-NEXT: bge a5, a1, .LBB39_16 ; CHECK-NOV-NEXT: .LBB39_7: # %entry -; CHECK-NOV-NEXT: blt a5, a1, .LBB39_9 +; CHECK-NOV-NEXT: blt a5, a2, .LBB39_9 ; CHECK-NOV-NEXT: .LBB39_8: # %entry -; CHECK-NOV-NEXT: lui a1, 1048568 +; CHECK-NOV-NEXT: lui a2, 1048568 ; CHECK-NOV-NEXT: .LBB39_9: # %entry ; CHECK-NOV-NEXT: sh a4, 0(a0) ; CHECK-NOV-NEXT: sh a3, 2(a0) -; CHECK-NOV-NEXT: sh a2, 4(a0) -; CHECK-NOV-NEXT: sh a1, 6(a0) +; CHECK-NOV-NEXT: sh a1, 4(a0) +; CHECK-NOV-NEXT: sh a2, 6(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB39_10: # %entry -; CHECK-NOV-NEXT: mv a1, a5 +; CHECK-NOV-NEXT: mv a2, a5 ; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a5, .LBB39_2 +; CHECK-NOV-NEXT: blt a1, a5, .LBB39_2 ; CHECK-NOV-NEXT: .LBB39_11: # %entry -; CHECK-NOV-NEXT: mv a2, a5 +; CHECK-NOV-NEXT: mv a1, a5 ; CHECK-NOV-NEXT: fcvt.w.s a4, fa0, rtz ; CHECK-NOV-NEXT: blt a3, a5, .LBB39_3 ; CHECK-NOV-NEXT: .LBB39_12: # %entry @@ -4615,10 +4615,10 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: blt a5, a3, .LBB39_6 ; CHECK-NOV-NEXT: .LBB39_15: # %entry ; CHECK-NOV-NEXT: lui a3, 1048568 -; CHECK-NOV-NEXT: blt a5, a2, .LBB39_7 +; CHECK-NOV-NEXT: blt a5, a1, .LBB39_7 ; CHECK-NOV-NEXT: .LBB39_16: # %entry -; CHECK-NOV-NEXT: lui a2, 1048568 -; CHECK-NOV-NEXT: bge a5, a1, .LBB39_8 +; CHECK-NOV-NEXT: lui a1, 1048568 +; CHECK-NOV-NEXT: bge a5, a2, .LBB39_8 ; CHECK-NOV-NEXT: j .LBB39_9 ; ; CHECK-V-LABEL: stest_f32i16_mm: @@ -4639,14 +4639,14 @@ entry: define <4 x i16> @utest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: utest_f32i16_mm: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.wu.s a1, fa0, rtz +; CHECK-NOV-NEXT: fcvt.wu.s a1, fa1, rtz +; CHECK-NOV-NEXT: fcvt.wu.s a2, fa0, rtz ; CHECK-NOV-NEXT: lui a3, 16 ; CHECK-NOV-NEXT: addiw a3, a3, -1 -; CHECK-NOV-NEXT: fcvt.wu.s a2, fa1, rtz -; CHECK-NOV-NEXT: bgeu a1, a3, .LBB40_6 +; CHECK-NOV-NEXT: bgeu a2, a3, .LBB40_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry ; CHECK-NOV-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bgeu a2, a3, .LBB40_7 +; CHECK-NOV-NEXT: bgeu a1, a3, .LBB40_7 ; 
CHECK-NOV-NEXT: .LBB40_2: # %entry ; CHECK-NOV-NEXT: fcvt.wu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bgeu a4, a3, .LBB40_8 @@ -4655,17 +4655,17 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: .LBB40_4: # %entry ; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB40_5: # %entry -; CHECK-NOV-NEXT: sh a1, 0(a0) -; CHECK-NOV-NEXT: sh a2, 2(a0) +; CHECK-NOV-NEXT: sh a2, 0(a0) +; CHECK-NOV-NEXT: sh a1, 2(a0) ; CHECK-NOV-NEXT: sh a4, 4(a0) ; CHECK-NOV-NEXT: sh a5, 6(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB40_6: # %entry -; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NOV-NEXT: bltu a2, a3, .LBB40_2 +; CHECK-NOV-NEXT: bltu a1, a3, .LBB40_2 ; CHECK-NOV-NEXT: .LBB40_7: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a1, a3 ; CHECK-NOV-NEXT: fcvt.wu.s a5, fa3, rtz ; CHECK-NOV-NEXT: bltu a4, a3, .LBB40_3 ; CHECK-NOV-NEXT: .LBB40_8: # %entry @@ -4690,50 +4690,50 @@ entry: define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i16_mm: ; CHECK-NOV: # %bb.0: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NOV-NEXT: lui a3, 16 -; CHECK-NOV-NEXT: addiw a3, a3, -1 ; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a3, .LBB41_6 +; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz +; CHECK-NOV-NEXT: lui a4, 16 +; CHECK-NOV-NEXT: addiw a4, a4, -1 +; CHECK-NOV-NEXT: bge a1, a4, .LBB41_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a4, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a3, .LBB41_7 +; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz +; CHECK-NOV-NEXT: bge a2, a4, .LBB41_7 ; CHECK-NOV-NEXT: .LBB41_2: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a4, a3, .LBB41_8 +; CHECK-NOV-NEXT: bge a3, a4, .LBB41_8 ; CHECK-NOV-NEXT: .LBB41_3: # %entry -; CHECK-NOV-NEXT: blt a5, a3, .LBB41_5 +; CHECK-NOV-NEXT: blt a5, a4, .LBB41_5 ; CHECK-NOV-NEXT: .LBB41_4: # %entry -; CHECK-NOV-NEXT: mv a5, a3 +; CHECK-NOV-NEXT: mv a5, a4 ; CHECK-NOV-NEXT: .LBB41_5: # %entry -; CHECK-NOV-NEXT: sgtz a3, a5 -; CHECK-NOV-NEXT: negw a3, a3 -; CHECK-NOV-NEXT: and a3, a3, a5 -; CHECK-NOV-NEXT: sgtz a5, a4 +; CHECK-NOV-NEXT: sgtz a4, a5 +; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: and a4, a4, a5 +; CHECK-NOV-NEXT: sgtz a5, a3 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a4, a5, a4 +; CHECK-NOV-NEXT: and a3, a5, a3 ; CHECK-NOV-NEXT: sgtz a5, a2 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 ; CHECK-NOV-NEXT: sgtz a5, a1 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a1, a5, a1 -; CHECK-NOV-NEXT: sh a3, 0(a0) -; CHECK-NOV-NEXT: sh a4, 2(a0) +; CHECK-NOV-NEXT: sh a4, 0(a0) +; CHECK-NOV-NEXT: sh a3, 2(a0) ; CHECK-NOV-NEXT: sh a2, 4(a0) ; CHECK-NOV-NEXT: sh a1, 6(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB41_6: # %entry -; CHECK-NOV-NEXT: mv a1, a3 -; CHECK-NOV-NEXT: fcvt.w.s a4, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a3, .LBB41_2 +; CHECK-NOV-NEXT: mv a1, a4 +; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz +; CHECK-NOV-NEXT: blt a2, a4, .LBB41_2 ; CHECK-NOV-NEXT: .LBB41_7: # %entry -; CHECK-NOV-NEXT: mv a2, a3 +; CHECK-NOV-NEXT: mv a2, a4 ; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a4, a3, .LBB41_3 +; CHECK-NOV-NEXT: blt a3, a4, .LBB41_3 ; CHECK-NOV-NEXT: .LBB41_8: # %entry -; CHECK-NOV-NEXT: mv a4, a3 -; CHECK-NOV-NEXT: bge a5, a3, .LBB41_4 +; CHECK-NOV-NEXT: mv a3, a4 +; CHECK-NOV-NEXT: bge a5, a4, .LBB41_4 ; CHECK-NOV-NEXT: j .LBB41_5 ; ; CHECK-V-LABEL: ustest_f32i16_mm: @@ -4790,16 +4790,16 @@ define <8 x i16> @stest_f16i16_mm(<8 x 
half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) -; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu a0, 48(a1) ; CHECK-NOV-NEXT: lhu s3, 56(a1) ; CHECK-NOV-NEXT: lhu s4, 0(a1) ; CHECK-NOV-NEXT: lhu s5, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -4820,8 +4820,8 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a7, 8 @@ -5000,7 +5000,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -5008,6 +5007,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5025,7 +5025,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -5033,6 +5032,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5057,11 +5057,11 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5076,11 +5076,11 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5172,16 +5172,16 @@ define <8 x i16> @utesth_f16i16_mm(<8 x 
half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) ; CHECK-NOV-NEXT: lhu s3, 48(a1) ; CHECK-NOV-NEXT: lhu s4, 56(a1) ; CHECK-NOV-NEXT: lhu s5, 0(a1) -; CHECK-NOV-NEXT: lhu a2, 8(a1) +; CHECK-NOV-NEXT: lhu a0, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 @@ -5202,8 +5202,8 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: fcvt.lu.s s1, fs6, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s5 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a3, 16 @@ -5340,7 +5340,6 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -5348,6 +5347,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5365,7 +5365,6 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -5373,6 +5372,7 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5397,11 +5397,11 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5416,11 +5416,11 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5511,16 +5511,16 @@ define <8 x i16> 
@ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: .cfi_offset fs5, -120 ; CHECK-NOV-NEXT: .cfi_offset fs6, -128 ; CHECK-NOV-NEXT: .cfi_remember_state +; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: lhu s1, 32(a1) ; CHECK-NOV-NEXT: lhu s2, 40(a1) -; CHECK-NOV-NEXT: lhu a2, 48(a1) +; CHECK-NOV-NEXT: lhu a0, 48(a1) ; CHECK-NOV-NEXT: lhu s3, 56(a1) ; CHECK-NOV-NEXT: lhu s4, 0(a1) ; CHECK-NOV-NEXT: lhu s5, 8(a1) ; CHECK-NOV-NEXT: lhu s6, 16(a1) ; CHECK-NOV-NEXT: lhu s7, 24(a1) -; CHECK-NOV-NEXT: mv s0, a0 -; CHECK-NOV-NEXT: fmv.w.x fa0, a2 +; CHECK-NOV-NEXT: fmv.w.x fa0, a0 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 @@ -5541,8 +5541,8 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s4 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: fcvt.l.s s1, fs6, rtz +; CHECK-NOV-NEXT: fmv.w.x fa0, s3 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-NOV-NEXT: lui a3, 16 @@ -5703,7 +5703,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -5711,6 +5710,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s6 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5728,7 +5728,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -5736,6 +5735,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s4 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5760,11 +5760,11 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -5779,11 +5779,11 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 ; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma diff --git 
a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll index 195ffc50594c3..9e8cd85739183 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll @@ -466,8 +466,8 @@ define @test5( %0, ; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: srl a0, a0, a2 ; CHECK-NEXT: andi a0, a0, 7 -; CHECK-NEXT: vfadd.vv v8, v8, v8 ; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: vfadd.vv v8, v8, v8 ; CHECK-NEXT: ret ; ; UNOPT-LABEL: test5: @@ -482,8 +482,8 @@ define @test5( %0, ; UNOPT-NEXT: slli a2, a2, 2 ; UNOPT-NEXT: srl a0, a0, a2 ; UNOPT-NEXT: andi a0, a0, 7 -; UNOPT-NEXT: vfadd.vv v8, v8, v8 ; UNOPT-NEXT: sw a0, 0(a1) +; UNOPT-NEXT: vfadd.vv v8, v8, v8 ; UNOPT-NEXT: ret entry: %a = call @llvm.riscv.vfadd.nxv1f32.nxv1f32( diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll index 3d992aa13e379..15ba3850de23d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll @@ -12,11 +12,11 @@ define @round_nxv1f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -35,11 +35,11 @@ define @round_nxv2f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -58,11 +58,11 @@ define @round_nxv4f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -81,11 +81,11 @@ define @round_nxv8f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -104,11 +104,11 @@ define @round_nxv16f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: 
vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -127,11 +127,11 @@ define @round_nxv32f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -152,9 +152,9 @@ define @round_nxv1f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -175,9 +175,9 @@ define @round_nxv2f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -198,9 +198,9 @@ define @round_nxv4f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -221,9 +221,9 @@ define @round_nxv8f32( %x) strictfp { ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -244,9 +244,9 @@ define @round_nxv16f32( %x) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -265,11 +265,11 @@ define @round_nxv1f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -288,11 +288,11 @@ define @round_nxv2f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v 
v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -311,11 +311,11 @@ define @round_nxv4f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -334,11 +334,11 @@ define @round_nxv8f64( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll index f7422b279149f..323a22a89bf7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll @@ -20,11 +20,11 @@ define @round_nxv1bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -43,11 +43,11 @@ define @round_nxv2bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -66,11 +66,11 @@ define @round_nxv4bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -89,11 +89,11 @@ define @round_nxv8bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -112,11 +112,11 @@ define @round_nxv16bf16( %x) { ; 
CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -135,11 +135,11 @@ define @round_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -169,12 +169,12 @@ define @round_nxv32bf16( %x) { define @round_nxv1f16( %x) { ; ZVFH-LABEL: round_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -187,11 +187,11 @@ define @round_nxv1f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -208,12 +208,12 @@ declare @llvm.round.nxv1f16() define @round_nxv2f16( %x) { ; ZVFH-LABEL: round_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -226,11 +226,11 @@ define @round_nxv2f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -247,12 +247,12 @@ declare @llvm.round.nxv2f16() define @round_nxv4f16( %x) { ; ZVFH-LABEL: round_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: 
vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -265,11 +265,11 @@ define @round_nxv4f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -286,12 +286,12 @@ declare @llvm.round.nxv4f16() define @round_nxv8f16( %x) { ; ZVFH-LABEL: round_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -304,11 +304,11 @@ define @round_nxv8f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -325,12 +325,12 @@ declare @llvm.round.nxv8f16() define @round_nxv16f16( %x) { ; ZVFH-LABEL: round_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -343,11 +343,11 @@ define @round_nxv16f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -364,12 +364,12 @@ declare @llvm.round.nxv16f16() define @round_nxv32f16( %x) { ; ZVFH-LABEL: round_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -382,11 +382,11 @@ define @round_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -421,8 +421,8 @@ define @round_nxv1f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -441,8 +441,8 @@ define @round_nxv2f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -461,8 +461,8 @@ define @round_nxv4f32( %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -481,8 +481,8 @@ define @round_nxv8f32( %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -501,8 +501,8 @@ define @round_nxv16f32( %x) { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -517,12 +517,12 @@ declare @llvm.round.nxv16f32() define @round_nxv1f64( %x) { ; CHECK-LABEL: round_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -537,12 +537,12 @@ declare @llvm.round.nxv1f64() define @round_nxv2f64( %x) { ; CHECK-LABEL: round_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -557,12 +557,12 @@ declare @llvm.round.nxv2f64() define @round_nxv4f64( %x) { ; CHECK-LABEL: round_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi 
a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -577,12 +577,12 @@ declare @llvm.round.nxv4f64() define @round_nxv8f64( %x) { ; CHECK-LABEL: round_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll index c293ac91b63bf..6cd6eef99a9ec 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll @@ -12,11 +12,11 @@ define @roundeven_nxv1f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -35,11 +35,11 @@ define @roundeven_nxv2f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -58,11 +58,11 @@ define @roundeven_nxv4f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -81,11 +81,11 @@ define @roundeven_nxv8f16( %x) strictfp { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -104,11 +104,11 @@ define @roundeven_nxv16f16( %x) strictf ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf 
v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -127,11 +127,11 @@ define @roundeven_nxv32f16( %x) strictf ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -152,9 +152,9 @@ define @roundeven_nxv1f32( %x) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -175,9 +175,9 @@ define @roundeven_nxv2f32( %x) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -198,9 +198,9 @@ define @roundeven_nxv4f32( %x) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -221,9 +221,9 @@ define @roundeven_nxv8f32( %x) strictfp ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -244,9 +244,9 @@ define @roundeven_nxv16f32( %x) stric ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -265,11 +265,11 @@ define @roundeven_nxv1f64( %x) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -288,11 +288,11 @@ define @roundeven_nxv2f64( %x) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: vmflt.vf 
v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -311,11 +311,11 @@ define @roundeven_nxv4f64( %x) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -334,11 +334,11 @@ define @roundeven_nxv8f64( %x) strict ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll index 865531b77eb29..903345dca1af2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll @@ -19,11 +19,11 @@ define @roundeven_nxv1bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -42,11 +42,11 @@ define @roundeven_nxv2bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -65,11 +65,11 @@ define @roundeven_nxv4bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -88,11 +88,11 @@ define @roundeven_nxv8bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -111,11 +111,11 @@ define @roundeven_nxv16bf16( %x) { ; 
CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v8, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -134,11 +134,11 @@ define @roundeven_nxv32bf16( %x) { ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -168,12 +168,12 @@ define @roundeven_nxv32bf16( %x) { define @roundeven_nxv1f16( %x) { ; ZVFH-LABEL: roundeven_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -186,11 +186,11 @@ define @roundeven_nxv1f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -207,12 +207,12 @@ declare @llvm.roundeven.nxv1f16() define @roundeven_nxv2f16( %x) { ; ZVFH-LABEL: roundeven_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -225,11 +225,11 @@ define @roundeven_nxv2f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -246,12 +246,12 @@ declare @llvm.roundeven.nxv2f16() define @roundeven_nxv4f16( %x) { ; ZVFH-LABEL: roundeven_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 
+; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -264,11 +264,11 @@ define @roundeven_nxv4f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -285,12 +285,12 @@ declare @llvm.roundeven.nxv4f16() define @roundeven_nxv8f16( %x) { ; ZVFH-LABEL: roundeven_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -303,11 +303,11 @@ define @roundeven_nxv8f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -324,12 +324,12 @@ declare @llvm.roundeven.nxv8f16() define @roundeven_nxv16f16( %x) { ; ZVFH-LABEL: roundeven_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -342,11 +342,11 @@ define @roundeven_nxv16f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -363,12 +363,12 @@ declare @llvm.roundeven.nxv16f16() define @roundeven_nxv32f16( %x) { ; ZVFH-LABEL: roundeven_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -381,11 +381,11 @@ define @roundeven_nxv32f16( %x) { ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -420,8 +420,8 @@ define @roundeven_nxv1f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -440,8 +440,8 @@ define @roundeven_nxv2f32( %x) { ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -460,8 +460,8 @@ define @roundeven_nxv4f32( %x) { ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -480,8 +480,8 @@ define @roundeven_nxv8f32( %x) { ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -500,8 +500,8 @@ define @roundeven_nxv16f32( %x) { ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -516,12 +516,12 @@ declare @llvm.roundeven.nxv16f32() define @roundeven_nxv1f64( %x) { ; CHECK-LABEL: roundeven_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI17_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -536,12 +536,12 @@ declare @llvm.roundeven.nxv1f64() define @roundeven_nxv2f64( %x) { ; CHECK-LABEL: roundeven_nxv2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -556,12 +556,12 @@ declare @llvm.roundeven.nxv2f64() define @roundeven_nxv4f64( %x) { ; CHECK-LABEL: roundeven_nxv4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; 
CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -576,12 +576,12 @@ declare @llvm.roundeven.nxv4f64() define @roundeven_nxv8f64( %x) { ; CHECK-LABEL: roundeven_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index b569efc7447da..f52200b4e7c34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -984,10 +984,10 @@ define @fshr_v16i64( %a, @fshr_v16i64( %a, @fshr_v16i64( %a, @fshr_v16i64( %a, @fshr_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, @ceil_nxv1f16_to_ui32( %x) { define @ceil_nxv1f16_to_si64( %x) { ; CHECK-LABEL: ceil_nxv1f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI22_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI22_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI22_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -388,12 +388,12 @@ define @ceil_nxv1f16_to_si64( %x) { define @ceil_nxv1f16_to_ui64( %x) { ; CHECK-LABEL: ceil_nxv1f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -533,12 +533,12 @@ define @ceil_nxv4f16_to_ui32( %x) { define @ceil_nxv4f16_to_si64( %x) { ; CHECK-LABEL: ceil_nxv4f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI30_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI30_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI30_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI30_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -590,12 +590,12 @@ define @ceil_nxv4f16_to_si64( %x) { define @ceil_nxv4f16_to_ui64( %x) { ; CHECK-LABEL: ceil_nxv4f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI31_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI31_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI31_0) +; CHECK-NEXT: flh fa5, %lo(.LCPI31_0)(a0) ; CHECK-NEXT: fsrmi a0, 3 +; 
CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll b/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll index 292f1deb2cce8..9475989d46343 100644 --- a/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll +++ b/llvm/test/CodeGen/RISCV/rvv/implicit-def-copy.ll @@ -11,8 +11,8 @@ define @vpload_nxv8i64(ptr %ptr, %m, i32 ze ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x11 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: $v0 = COPY [[COPY1]] - ; CHECK-NEXT: [[PseudoVLE64_V_M8_MASK:%[0-9]+]]:vrm8nov0 = PseudoVLE64_V_M8_MASK $noreg, [[COPY2]], $v0, [[COPY]], 6 /* e64 */, 1 /* ta, mu */ :: (load unknown-size from %ir.ptr, align 64) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vmv0 = COPY [[COPY1]] + ; CHECK-NEXT: [[PseudoVLE64_V_M8_MASK:%[0-9]+]]:vrm8nov0 = PseudoVLE64_V_M8_MASK $noreg, [[COPY2]], [[COPY3]], [[COPY]], 6 /* e64 */, 1 /* ta, mu */ :: (load unknown-size from %ir.ptr, align 64) ; CHECK-NEXT: $v8m8 = COPY [[PseudoVLE64_V_M8_MASK]] ; CHECK-NEXT: PseudoRET implicit $v8m8 %load = call @llvm.vp.load.nxv8i64.p0(ptr %ptr, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/localvar.ll b/llvm/test/CodeGen/RISCV/rvv/localvar.ll index ad8fde013ce08..fb7cd0072efa9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/localvar.ll +++ b/llvm/test/CodeGen/RISCV/rvv/localvar.ll @@ -223,14 +223,14 @@ define void @local_var_m2_with_varsize_object(i64 %n) { ; RV64IV-NEXT: andi a0, a0, -16 ; RV64IV-NEXT: sub a0, sp, a0 ; RV64IV-NEXT: mv sp, a0 -; RV64IV-NEXT: csrr a1, vlenb -; RV64IV-NEXT: slli a1, a1, 1 -; RV64IV-NEXT: sub a1, s0, a1 -; RV64IV-NEXT: addi a1, a1, -32 ; RV64IV-NEXT: csrr s1, vlenb ; RV64IV-NEXT: slli s1, s1, 1 ; RV64IV-NEXT: sub s1, s0, s1 ; RV64IV-NEXT: addi s1, s1, -32 +; RV64IV-NEXT: csrr a1, vlenb +; RV64IV-NEXT: slli a1, a1, 1 +; RV64IV-NEXT: sub a1, s0, a1 +; RV64IV-NEXT: addi a1, a1, -32 ; RV64IV-NEXT: call notdead ; RV64IV-NEXT: vl2r.v v8, (s1) ; RV64IV-NEXT: csrr a0, vlenb @@ -282,15 +282,15 @@ define void @local_var_m2_with_bp(i64 %n) { ; RV64IV-NEXT: andi a0, a0, -16 ; RV64IV-NEXT: sub a0, sp, a0 ; RV64IV-NEXT: mv sp, a0 +; RV64IV-NEXT: csrr s2, vlenb +; RV64IV-NEXT: slli s2, s2, 1 +; RV64IV-NEXT: add s2, s1, s2 +; RV64IV-NEXT: addi s2, s2, 224 ; RV64IV-NEXT: addi a1, s1, 128 ; RV64IV-NEXT: csrr a2, vlenb ; RV64IV-NEXT: slli a2, a2, 1 ; RV64IV-NEXT: add a2, s1, a2 ; RV64IV-NEXT: addi a2, a2, 224 -; RV64IV-NEXT: csrr s2, vlenb -; RV64IV-NEXT: slli s2, s2, 1 -; RV64IV-NEXT: add s2, s1, s2 -; RV64IV-NEXT: addi s2, s2, 224 ; RV64IV-NEXT: call notdead2 ; RV64IV-NEXT: lw zero, 124(s1) ; RV64IV-NEXT: vl2r.v v8, (s2) diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir index 6fe228f44a1c8..2d49b4e4f493f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir +++ b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir @@ -17,9 +17,9 @@ body: | ; CHECK: liveins: $v0, $v1, $v2, $v3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 1, 192 /* e8, m1, ta, ma */, implicit-def $vl, implicit-def $vtype - ; CHECK-NEXT: renamable $v8 = PseudoVMERGE_VIM_M1 undef renamable $v8, killed renamable $v2, 1, killed renamable $v0, 1, 3 /* e8 */, implicit $vl, implicit $vtype - ; CHECK-NEXT: renamable $v0 = COPY killed renamable $v1, implicit $vtype - ; CHECK-NEXT: renamable $v9 = PseudoVMERGE_VIM_M1 undef renamable $v9, killed renamable $v3, 1, killed renamable 
$v0, 1, 3 /* e8 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: renamable $v8 = PseudoVMERGE_VIM_M1 undef renamable $v8, killed renamable $v2, 1, $v0, 1, 3 /* e8 */, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v0 = COPY killed renamable $v1, implicit $vtype + ; CHECK-NEXT: renamable $v9 = PseudoVMERGE_VIM_M1 undef renamable $v9, killed renamable $v3, 1, $v0, 1, 3 /* e8 */, implicit $vl, implicit $vtype ; CHECK-NEXT: renamable $v0 = PseudoVADD_VV_M1 undef renamable $v0, killed renamable $v8, killed renamable $v9, 1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: PseudoRET implicit $v0 %0:vr = COPY $v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll index 2553f563b7d0f..85b04f177f66f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll @@ -137,12 +137,12 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: sb a2, 6(a0) ; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a2, a0, 4 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: addi a1, a1, 4 ; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: vse8.v v8, (a2) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memcpy7: @@ -151,12 +151,12 @@ define void @unaligned_memcpy7(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: sb a2, 6(a0) ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a2, a0, 4 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: addi a1, a1, 4 ; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; RV64-NEXT: vle8.v v8, (a1) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: vse8.v v8, (a2) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy7: @@ -223,11 +223,11 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; RV32-NEXT: vle8.v v8, (a2) ; RV32-NEXT: addi a2, a0, 12 +; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: vse8.v v8, (a2) ; RV32-NEXT: addi a1, a1, 8 ; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; @@ -242,11 +242,11 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; RV64-NEXT: vle8.v v8, (a2) ; RV64-NEXT: addi a2, a0, 12 +; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: vse8.v v8, (a2) ; RV64-NEXT: addi a1, a1, 8 ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV64-NEXT: vle8.v v8, (a1) -; RV64-NEXT: addi a0, a0, 8 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; @@ -312,9 +312,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: vle8.v v8, (a1) ; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: addi a0, a0, 15 ; RV32-NEXT: addi a1, a1, 15 ; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: addi a0, a0, 15 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; @@ -323,9 +323,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vle8.v v8, (a1) ; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: addi a0, a0, 15 ; RV64-NEXT: addi a1, a1, 15 ; RV64-NEXT: vle8.v v8, (a1) -; RV64-NEXT: addi a0, a0, 15 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; @@ -334,9 +334,9 @@ define void 
@unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: addi a0, a0, 15 ; RV32-FAST-NEXT: addi a1, a1, 15 ; RV32-FAST-NEXT: vle64.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 15 ; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; @@ -345,9 +345,9 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: addi a0, a0, 15 ; RV64-FAST-NEXT: addi a1, a1, 15 ; RV64-FAST-NEXT: vle64.v v8, (a1) -; RV64-FAST-NEXT: addi a0, a0, 15 ; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret entry: @@ -459,10 +459,10 @@ define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: addi a0, a0, 64 ; RV32-FAST-NEXT: addi a1, a1, 64 ; RV32-FAST-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-FAST-NEXT: vle64.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 64 ; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; @@ -471,10 +471,10 @@ define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: addi a0, a0, 64 ; RV64-FAST-NEXT: addi a1, a1, 64 ; RV64-FAST-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) -; RV64-FAST-NEXT: addi a0, a0, 64 ; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret entry: @@ -568,12 +568,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: sw a2, 192(a0) ; RV32-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: addi a2, a0, 128 ; RV32-FAST-NEXT: vse64.v v8, (a0) -; RV32-FAST-NEXT: addi a1, a1, 128 +; RV32-FAST-NEXT: addi a0, a1, 128 ; RV32-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-FAST-NEXT: vle64.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 128 -; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: vle64.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a2) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy196: @@ -582,12 +582,12 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-NEXT: sw a2, 192(a0) ; RV64-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) +; RV64-FAST-NEXT: addi a2, a0, 128 ; RV64-FAST-NEXT: vse64.v v8, (a0) -; RV64-FAST-NEXT: addi a1, a1, 128 +; RV64-FAST-NEXT: addi a0, a1, 128 ; RV64-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-FAST-NEXT: vle64.v v8, (a1) -; RV64-FAST-NEXT: addi a0, a0, 128 -; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: vle64.v v8, (a0) +; RV64-FAST-NEXT: vse64.v v8, (a2) ; RV64-FAST-NEXT: ret entry: tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 196, i1 false) @@ -624,9 +624,9 @@ define void @unaligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: addi a0, a0, 128 ; RV32-FAST-NEXT: addi a1, a1, 128 ; RV32-FAST-NEXT: vle64.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 128 ; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; @@ -635,9 +635,9 @@ define 
void @unaligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: addi a0, a0, 128 ; RV64-FAST-NEXT: addi a1, a1, 128 ; RV64-FAST-NEXT: vle64.v v8, (a1) -; RV64-FAST-NEXT: addi a0, a0, 128 ; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret entry: @@ -837,10 +837,10 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a1) ; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: addi a0, a0, 15 ; RV32-NEXT: addi a1, a1, 15 ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: addi a0, a0, 15 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; @@ -849,10 +849,10 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: addi a0, a0, 15 ; RV64-NEXT: addi a1, a1, 15 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vle8.v v8, (a1) -; RV64-NEXT: addi a0, a0, 15 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; @@ -861,9 +861,9 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: addi a0, a0, 15 ; RV32-FAST-NEXT: addi a1, a1, 15 ; RV32-FAST-NEXT: vle64.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 15 ; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; @@ -872,9 +872,9 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: addi a0, a0, 15 ; RV64-FAST-NEXT: addi a1, a1, 15 ; RV64-FAST-NEXT: vle64.v v8, (a1) -; RV64-FAST-NEXT: addi a0, a0, 15 ; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret entry: @@ -926,10 +926,10 @@ define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-BOTH-NEXT: vle64.v v8, (a1) ; RV32-BOTH-NEXT: vse64.v v8, (a0) +; RV32-BOTH-NEXT: addi a0, a0, 64 ; RV32-BOTH-NEXT: addi a1, a1, 64 ; RV32-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-BOTH-NEXT: vle64.v v8, (a1) -; RV32-BOTH-NEXT: addi a0, a0, 64 ; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; @@ -938,10 +938,10 @@ define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { ; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-BOTH-NEXT: vle64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: addi a0, a0, 64 ; RV64-BOTH-NEXT: addi a1, a1, 64 ; RV64-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-BOTH-NEXT: vle64.v v8, (a1) -; RV64-BOTH-NEXT: addi a0, a0, 64 ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret entry: @@ -975,12 +975,12 @@ define void @aligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-NEXT: sw a2, 192(a0) ; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: addi a2, a0, 128 ; RV32-BOTH-NEXT: vse64.v v8, (a0) -; RV32-BOTH-NEXT: addi a1, a1, 128 +; RV32-BOTH-NEXT: addi a0, a1, 128 ; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-BOTH-NEXT: vle64.v v8, (a1) -; RV32-BOTH-NEXT: addi a0, a0, 128 -; RV32-BOTH-NEXT: vse64.v v8, (a0) +; RV32-BOTH-NEXT: vle64.v v8, (a0) +; 
RV32-BOTH-NEXT: vse64.v v8, (a2) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy196: @@ -989,12 +989,12 @@ define void @aligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV64-BOTH-NEXT: sw a2, 192(a0) ; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-BOTH-NEXT: vle64.v v8, (a1) +; RV64-BOTH-NEXT: addi a2, a0, 128 ; RV64-BOTH-NEXT: vse64.v v8, (a0) -; RV64-BOTH-NEXT: addi a1, a1, 128 +; RV64-BOTH-NEXT: addi a0, a1, 128 ; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-BOTH-NEXT: vle64.v v8, (a1) -; RV64-BOTH-NEXT: addi a0, a0, 128 -; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: vle64.v v8, (a0) +; RV64-BOTH-NEXT: vse64.v v8, (a2) ; RV64-BOTH-NEXT: ret entry: tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 196, i1 false) @@ -1007,9 +1007,9 @@ define void @aligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-BOTH-NEXT: vle64.v v8, (a1) ; RV32-BOTH-NEXT: vse64.v v8, (a0) +; RV32-BOTH-NEXT: addi a0, a0, 128 ; RV32-BOTH-NEXT: addi a1, a1, 128 ; RV32-BOTH-NEXT: vle64.v v8, (a1) -; RV32-BOTH-NEXT: addi a0, a0, 128 ; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; @@ -1018,9 +1018,9 @@ define void @aligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { ; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-BOTH-NEXT: vle64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) +; RV64-BOTH-NEXT: addi a0, a0, 128 ; RV64-BOTH-NEXT: addi a1, a1, 128 ; RV64-BOTH-NEXT: vle64.v v8, (a1) -; RV64-BOTH-NEXT: addi a0, a0, 128 ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/memory-args.ll b/llvm/test/CodeGen/RISCV/rvv/memory-args.ll index 8190a82d7035b..f4502ee0fa8f4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/memory-args.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memory-args.ll @@ -55,9 +55,9 @@ define @caller() { ; RV64IV-NEXT: add a0, sp, a0 ; RV64IV-NEXT: addi a0, a0, 64 ; RV64IV-NEXT: vl8r.v v24, (a0) -; RV64IV-NEXT: addi a1, sp, 64 ; RV64IV-NEXT: addi a0, sp, 64 -; RV64IV-NEXT: vs8r.v v24, (a1) +; RV64IV-NEXT: vs8r.v v24, (a0) +; RV64IV-NEXT: addi a0, sp, 64 ; RV64IV-NEXT: call callee ; RV64IV-NEXT: addi sp, s0, -80 ; RV64IV-NEXT: .cfi_def_cfa sp, 80 diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index 0fad09f27007c..893658ebb1901 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -1221,12 +1221,12 @@ define void @mgather_nxv16i64( %ptrs0, %ptr ; RV32: # %bb.0: ; RV32-NEXT: vl8re64.v v24, (a0) ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu +; RV32-NEXT: vluxei32.v v16, (zero), v8, v0.t ; RV32-NEXT: srli a2, a0, 3 ; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v7, v0, a2 +; RV32-NEXT: vslidedown.vx v0, v0, a2 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; RV32-NEXT: vluxei32.v v16, (zero), v8, v0.t -; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: vluxei32.v v24, (zero), v12, v0.t ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, a1, a0 @@ -1236,37 +1236,20 @@ define void @mgather_nxv16i64( %ptrs0, %ptr ; ; RV64-LABEL: mgather_nxv16i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 -; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 
16 + 8 * vlenb -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; RV64-NEXT: vmv8r.v v16, v8 ; RV64-NEXT: vl8re64.v v24, (a0) -; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v24, (zero), v8, v0.t ; RV64-NEXT: vl8re64.v v8, (a1) +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a1, a0, 3 -; RV64-NEXT: vslidedown.vx v7, v0, a1 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v8, (zero), v16, v0.t ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, a2, a0 ; RV64-NEXT: vs8r.v v8, (a0) ; RV64-NEXT: vs8r.v v24, (a2) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %p0 = call @llvm.vector.insert.nxv8p0.nxv16p0( undef, %ptrs0, i64 0) %p1 = call @llvm.vector.insert.nxv8p0.nxv16p0( %p0, %ptrs1, i64 8) @@ -2347,12 +2330,12 @@ define @mgather_baseidx_nxv32i8(ptr %base, ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v24, v9 ; RV64-NEXT: srli a2, a1, 3 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a2 -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v24, v9 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vsetvli a3, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index 3cf7cc9cb5152..cd6f76a79373f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1904,57 +1904,25 @@ define void @mscatter_nxv16f64( %val0, @reverse_nxv3i64( %a) { ; CHECK-LABEL: reverse_nxv3i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v14, v12, a0 -; CHECK-NEXT: vrgather.vv v13, v10, v14 -; CHECK-NEXT: vrgather.vv v10, v9, v14 -; CHECK-NEXT: vmv.v.v v12, v13 -; CHECK-NEXT: vrgather.vv v15, v8, v14 -; CHECK-NEXT: vmv.v.v v13, v10 -; CHECK-NEXT: vrgather.vv v8, v11, v14 -; CHECK-NEXT: vmv.v.v v14, v15 -; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vrsub.vx v12, v12, a0 +; CHECK-NEXT: vrgather.vv v15, v8, v12 +; CHECK-NEXT: vrgather.vv v14, v9, v12 +; CHECK-NEXT: vrgather.vv v9, v10, v12 +; CHECK-NEXT: vrgather.vv v8, v11, v12 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vmv.v.v v9, v14 +; CHECK-NEXT: vmv.v.v v10, v15 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv3i64( %a) ret %res @@ -1969,19 +1968,18 @@ define @reverse_nxv6i64( %a) { ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vrsub.vx v22, v16, a0 -; CHECK-NEXT: vrgather.vv v21, v10, v22 -; CHECK-NEXT: vrgather.vv v19, v12, v22 -; CHECK-NEXT: vrgather.vv v18, v13, v22 -; 
CHECK-NEXT: vrgather.vv v20, v11, v22 -; CHECK-NEXT: vmv2r.v v16, v18 -; CHECK-NEXT: vmv2r.v v18, v20 -; CHECK-NEXT: vrgather.vv v31, v8, v22 -; CHECK-NEXT: vrgather.vv v30, v9, v22 -; CHECK-NEXT: vrgather.vv v9, v14, v22 -; CHECK-NEXT: vrgather.vv v8, v15, v22 -; CHECK-NEXT: vmv2r.v v20, v30 -; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vrsub.vx v16, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v16 +; CHECK-NEXT: vrgather.vv v21, v10, v16 +; CHECK-NEXT: vrgather.vv v22, v9, v16 +; CHECK-NEXT: vrgather.vv v20, v11, v16 +; CHECK-NEXT: vrgather.vv v11, v12, v16 +; CHECK-NEXT: vrgather.vv v10, v13, v16 +; CHECK-NEXT: vrgather.vv v9, v14, v16 +; CHECK-NEXT: vrgather.vv v8, v15, v16 +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: vmv2r.v v12, v22 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv6i64( %a) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll index 7d3700492ea7b..12042975b5adf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll @@ -22,22 +22,22 @@ define @vp_nearbyint_nxv1bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v11, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1bf16( %va, %m, i32 %evl) ret %v @@ -49,18 +49,18 @@ define @vp_nearbyint_nxv1bf16_unmasked( @llvm.vp.nearbyint.nxv1bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -76,22 +76,22 @@ define @vp_nearbyint_nxv2bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v11, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2bf16( %va, %m, i32 %evl) ret %v @@ -103,18 +103,18 @@ define @vp_nearbyint_nxv2bf16_unmasked( @llvm.vp.nearbyint.nxv2bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -130,22 +130,22 @@ define @vp_nearbyint_nxv4bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, 
e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4bf16( %va, %m, i32 %evl) ret %v @@ -157,18 +157,18 @@ define @vp_nearbyint_nxv4bf16_unmasked( @llvm.vp.nearbyint.nxv4bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -184,22 +184,22 @@ define @vp_nearbyint_nxv8bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8bf16( %va, %m, i32 %evl) ret %v @@ -211,18 +211,18 @@ define @vp_nearbyint_nxv8bf16_unmasked( @llvm.vp.nearbyint.nxv8bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -238,22 +238,22 @@ define @vp_nearbyint_nxv16bf16( %va ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16bf16( %va, %m, i32 %evl) ret %v @@ -265,18 +265,18 @@ define @vp_nearbyint_nxv16bf16_unmasked( @llvm.vp.nearbyint.nxv16bf16( %va, splat (i1 true), i32 %evl) ret %v @@ -297,6 +297,7 @@ define @vp_nearbyint_nxv32bf16( %va ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: lui a3, 307200 +; CHECK-NEXT: frflags a4 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 @@ -315,12 +316,11 @@ define @vp_nearbyint_nxv32bf16( %va ; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t -; CHECK-NEXT: frflags a2 ; CHECK-NEXT: vmv1r.v v0, v18 ; 
CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t -; CHECK-NEXT: fsflags a2 +; CHECK-NEXT: fsflags a4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v17 @@ -336,21 +336,21 @@ define @vp_nearbyint_nxv32bf16( %va ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -375,11 +375,12 @@ define @vp_nearbyint_nxv32bf16_unmasked( @vp_nearbyint_nxv32bf16_unmasked( @vp_nearbyint_nxv32bf16_unmasked( @llvm.vp.nearbyint.nxv1f16(, @vp_nearbyint_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a0) +; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16: @@ -461,22 +461,22 @@ define @vp_nearbyint_nxv1f16( %va, @llvm.vp.nearbyint.nxv1f16( %va, %m, i32 %evl) ret %v @@ -485,17 +485,17 @@ define @vp_nearbyint_nxv1f16( %va, @vp_nearbyint_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16_unmasked: @@ -503,18 +503,18 @@ define @vp_nearbyint_nxv1f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v 
v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f16( %va, splat (i1 true), i32 %evl) ret %v @@ -525,19 +525,19 @@ declare @llvm.vp.nearbyint.nxv2f16(, @vp_nearbyint_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a0) +; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16: @@ -547,22 +547,22 @@ define @vp_nearbyint_nxv2f16( %va, @llvm.vp.nearbyint.nxv2f16( %va, %m, i32 %evl) ret %v @@ -571,17 +571,17 @@ define @vp_nearbyint_nxv2f16( %va, @vp_nearbyint_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16_unmasked: @@ -589,18 +589,18 @@ define @vp_nearbyint_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v @@ -611,19 +611,19 @@ declare @llvm.vp.nearbyint.nxv4f16(, @vp_nearbyint_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a0) +; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: frflags a0 ; 
ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16: @@ -633,22 +633,22 @@ define @vp_nearbyint_nxv4f16( %va, @llvm.vp.nearbyint.nxv4f16( %va, %m, i32 %evl) ret %v @@ -657,17 +657,17 @@ define @vp_nearbyint_nxv4f16( %va, @vp_nearbyint_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16_unmasked: @@ -675,18 +675,18 @@ define @vp_nearbyint_nxv4f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f16( %va, splat (i1 true), i32 %evl) ret %v @@ -699,19 +699,19 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16( %va, @llvm.vp.nearbyint.nxv8f16( %va, %m, i32 %evl) ret %v @@ -745,17 +745,17 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16_unmasked: @@ -763,18 +763,18 @@ define @vp_nearbyint_nxv8f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli 
zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f16( %va, splat (i1 true), i32 %evl) ret %v @@ -787,19 +787,19 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16( %va, @llvm.vp.nearbyint.nxv16f16( %va, %m, i32 %evl) ret %v @@ -833,17 +833,17 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16_unmasked: @@ -851,18 +851,18 @@ define @vp_nearbyint_nxv16f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f16( %va, splat (i1 true), i32 %evl) ret %v @@ -875,19 +875,19 @@ define @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; ZVFH-NEXT: frflags a0 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked: @@ -995,11 +995,12 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 +; ZVFHMIN-NEXT: frflags a4 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: vmv1r.v v17, v16 @@ -1014,12 +1015,11 @@ define 
@vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t -; ZVFHMIN-NEXT: frflags a2 ; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a2 +; ZVFHMIN-NEXT: fsflags a4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v16 @@ -1033,17 +1033,17 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -1064,15 +1064,15 @@ define @vp_nearbyint_nxv1f32( %va, @llvm.vp.nearbyint.nxv1f32( %va, %m, i32 %evl) ret %v @@ -1085,13 +1085,13 @@ define @vp_nearbyint_nxv1f32_unmasked( ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f32( %va, splat (i1 true), i32 %evl) ret %v @@ -1106,15 +1106,15 @@ define @vp_nearbyint_nxv2f32( %va, @llvm.vp.nearbyint.nxv2f32( %va, %m, i32 %evl) ret %v @@ -1127,13 +1127,13 @@ define @vp_nearbyint_nxv2f32_unmasked( ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v @@ -1149,16 +1149,16 @@ define @vp_nearbyint_nxv4f32( %va, @llvm.vp.nearbyint.nxv4f32( %va, %m, i32 %evl) ret %v @@ -1171,13 +1171,13 @@ define @vp_nearbyint_nxv4f32_unmasked( ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f32( %va, splat (i1 true), i32 %evl) ret %v @@ -1193,16 +1193,16 @@ define @vp_nearbyint_nxv8f32( %va, @llvm.vp.nearbyint.nxv8f32( %va, %m, i32 %evl) ret %v @@ -1215,13 +1215,13 @@ define 
@vp_nearbyint_nxv8f32_unmasked( ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f32( %va, splat (i1 true), i32 %evl) ret %v @@ -1237,16 +1237,16 @@ define @vp_nearbyint_nxv16f32( %va, < ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f32( %va, %m, i32 %evl) ret %v @@ -1259,13 +1259,13 @@ define @vp_nearbyint_nxv16f32_unmasked( @llvm.vp.nearbyint.nxv16f32( %va, splat (i1 true), i32 %evl) ret %v @@ -1276,19 +1276,19 @@ declare @llvm.vp.nearbyint.nxv1f64(, define @vp_nearbyint_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a0) +; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f64( %va, %m, i32 %evl) ret %v @@ -1297,17 +1297,17 @@ define @vp_nearbyint_nxv1f64( %va, @vp_nearbyint_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1320,19 +1320,19 @@ define @vp_nearbyint_nxv2f64( %va, @llvm.vp.nearbyint.nxv2f64( %va, %m, i32 %evl) ret %v @@ -1341,17 +1341,17 @@ define @vp_nearbyint_nxv2f64( %va, @vp_nearbyint_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v 
v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1364,19 +1364,19 @@ define @vp_nearbyint_nxv4f64( %va, @llvm.vp.nearbyint.nxv4f64( %va, %m, i32 %evl) ret %v @@ -1385,17 +1385,17 @@ define @vp_nearbyint_nxv4f64( %va, @vp_nearbyint_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1408,19 +1408,19 @@ define @vp_nearbyint_nxv7f64( %va, @llvm.vp.nearbyint.nxv7f64( %va, %m, i32 %evl) ret %v @@ -1429,17 +1429,17 @@ define @vp_nearbyint_nxv7f64( %va, @vp_nearbyint_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1452,19 +1452,19 @@ define @vp_nearbyint_nxv8f64( %va, @llvm.vp.nearbyint.nxv8f64( %va, %m, i32 %evl) ret %v @@ -1473,17 +1473,17 @@ define @vp_nearbyint_nxv8f64( %va, @vp_nearbyint_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: frflags a0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1495,47 +1495,73 @@ declare @llvm.vp.nearbyint.nxv16f64( @vp_nearbyint_nxv16f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv16f64: ; CHECK: # %bb.0: +; CHECK-NEXT: addi 
sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI44_0) ; CHECK-NEXT: srli a3, a1, 3 ; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 +; CHECK-NEXT: vslidedown.vx v25, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: frflags a3 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a2 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f64( %va, %m, i32 %evl) ret %v @@ -1551,13 +1577,13 @@ define @vp_nearbyint_nxv16f64_unmasked( @vp_nearbyint_nxv16f64_unmasked( @llvm.vp.nearbyint.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v diff --git 
a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll index feb96deb920ff..b83439f6baa22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll +++ b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll @@ -19,24 +19,23 @@ define signext i32 @foo(i32 signext %aa) #0 { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: andi sp, sp, -16 ; CHECK-NEXT: mv s1, sp -; CHECK-NEXT: lw t0, 44(s1) +; CHECK-NEXT: sw a0, 52(s1) +; CHECK-NEXT: sw a0, 48(s1) +; CHECK-NEXT: lw a0, 44(s1) ; CHECK-NEXT: lw a2, 40(s1) ; CHECK-NEXT: lw a3, 36(s1) ; CHECK-NEXT: lw a4, 32(s1) ; CHECK-NEXT: lw a5, 28(s1) ; CHECK-NEXT: lw a6, 24(s1) ; CHECK-NEXT: lw a7, 20(s1) -; CHECK-NEXT: lw t1, 16(s1) -; CHECK-NEXT: lw t2, 12(s1) -; CHECK-NEXT: lw t3, 8(s1) -; CHECK-NEXT: sw a0, 52(s1) -; CHECK-NEXT: sw a0, 48(s1) +; CHECK-NEXT: lw a1, 16(s1) +; CHECK-NEXT: lw t0, 12(s1) +; CHECK-NEXT: lw t1, 8(s1) ; CHECK-NEXT: addi sp, sp, -32 +; CHECK-NEXT: sd a1, 0(sp) +; CHECK-NEXT: sd t0, 8(sp) +; CHECK-NEXT: sd t1, 16(sp) ; CHECK-NEXT: addi a1, s1, 48 -; CHECK-NEXT: sd t1, 0(sp) -; CHECK-NEXT: sd t2, 8(sp) -; CHECK-NEXT: sd t3, 16(sp) -; CHECK-NEXT: mv a0, t0 ; CHECK-NEXT: call gfunc ; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: li a0, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll index 8457f3d2c149c..c6662e092aa5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll @@ -14,8 +14,8 @@ define @foo( %x, @llvm.vp.fmul.nxv1f64( %x, %y, %m, i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/pr125306.ll b/llvm/test/CodeGen/RISCV/rvv/pr125306.ll index 111f87de220db..f3ac76eaace6f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr125306.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr125306.ll @@ -19,57 +19,57 @@ define <2 x i32> @main(ptr %0) { ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: li a2, 64 +; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: sw zero, 80(zero) -; CHECK-NEXT: lui a1, 7 +; CHECK-NEXT: lui a2, 7 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vid.v v11 ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: lui a5, 2 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v10, (a2) +; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: li a2, 24 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: sh zero, -392(a3) ; CHECK-NEXT: sh zero, 534(a3) ; CHECK-NEXT: sh zero, 1460(a3) ; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vse32.v v10, (a2) -; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: vse32.v v10, (a1) +; CHECK-NEXT: li a1, 40 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vadd.vi v9, v11, -1 ; CHECK-NEXT: sh zero, -1710(a5) ; CHECK-NEXT: sh zero, -784(a5) ; CHECK-NEXT: sh zero, 142(a5) -; CHECK-NEXT: lw a5, -304(a1) -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vadd.vi v9, v11, -1 ; CHECK-NEXT: vse32.v v10, (a3) ; CHECK-NEXT: sh zero, 0(a0) -; CHECK-NEXT: lw a0, -188(a1) -; CHECK-NEXT: vse32.v v10, (a2) -; CHECK-NEXT: lw a2, -188(a1) -; CHECK-NEXT: lw a3, 1244(a1) -; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: lw a0, 1244(a1) -; CHECK-NEXT: lw a1, -304(a1) -; CHECK-NEXT: vmv.v.x v10, a3 -; CHECK-NEXT: vmv.v.x v11, a5 +; CHECK-NEXT: vse32.v v10, (a1) +; CHECK-NEXT: lw a0, 
1244(a2) +; CHECK-NEXT: lw a1, 1244(a2) +; CHECK-NEXT: lw a3, -188(a2) +; CHECK-NEXT: lw a5, -188(a2) +; CHECK-NEXT: vmv.v.x v8, a3 +; CHECK-NEXT: lw a3, -304(a2) +; CHECK-NEXT: lw a2, -304(a2) +; CHECK-NEXT: sh zero, 0(zero) +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vmv.v.x v11, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, zero ; CHECK-NEXT: vslide1down.vx v10, v10, zero ; CHECK-NEXT: vmin.vv v8, v10, v8 -; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vslide1down.vx v11, v11, zero +; CHECK-NEXT: vmin.vx v10, v10, a5 ; CHECK-NEXT: vmin.vx v10, v10, a2 -; CHECK-NEXT: vmin.vx v10, v10, a1 ; CHECK-NEXT: vmin.vv v11, v8, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: vand.vv v9, v11, v9 -; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: vse32.v v9, (a4) -; CHECK-NEXT: sh zero, 0(zero) +; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: ret entry: store <16 x i32> zeroinitializer, ptr null, align 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll index dbd4224c7ef08..d09b200485092 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll @@ -18,12 +18,11 @@ define <4 x float> @foo(ptr %0) nounwind { ; CHECK-NEXT: lhu a0, 6(a0) ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fmv.w.x fa5, s2 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: fmv.s fa0, fa5 +; CHECK-NEXT: fmv.w.x fa0, s2 ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.s.f v8, fa0 @@ -36,12 +35,11 @@ define <4 x float> @foo(ptr %0) nounwind { ; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: fmv.w.x fa0, s1 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fmv.w.x fa5, s0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v8, fa0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: fmv.s fa0, fa5 +; CHECK-NEXT: fmv.w.x fa0, s0 ; CHECK-NEXT: call __extendhfsf2 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.s.f v8, fa0 diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index 06a357eeaeb61..4be681ec51234 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -105,8 +105,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, @vp_round_nxv1bf16( %va, @vp_round_nxv1bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -76,12 +76,12 @@ define @vp_round_nxv2bf16( %va, @vp_round_nxv2bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v8, v9 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t ; CHECK-NEXT: fsrm a0 
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -130,12 +130,12 @@ define @vp_round_nxv4bf16( %va, @vp_round_nxv4bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v8, v10 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -184,12 +184,12 @@ define @vp_round_nxv8bf16( %va, @vp_round_nxv8bf16_unmasked( % ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 ; CHECK-NEXT: lui a0, 307200 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v8, v12 -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v8, fa5 -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -238,12 +238,12 @@ define @vp_round_nxv16bf16( %va, @vp_round_nxv16bf16_unmasked( @vp_round_nxv32bf16( %va, @vp_round_nxv32bf16( %va, @vp_round_nxv32bf16( %va, @vp_round_nxv32bf16_unmasked( @vp_round_nxv32bf16_unmasked( @vp_round_nxv32bf16_unmasked( @llvm.vp.round.nxv1f16(, @vp_round_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a0) +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -461,12 +461,12 @@ define @vp_round_nxv1f16( %va, @vp_round_nxv1f16( %va, @vp_round_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -503,11 +503,11 @@ define @vp_round_nxv1f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -525,13 +525,13 @@ declare @llvm.vp.round.nxv2f16(, @vp_round_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a0) +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: 
vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -547,12 +547,12 @@ define @vp_round_nxv2f16( %va, @vp_round_nxv2f16( %va, @vp_round_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -589,11 +589,11 @@ define @vp_round_nxv2f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -611,13 +611,13 @@ declare @llvm.vp.round.nxv4f16(, @vp_round_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a0) +; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -633,12 +633,12 @@ define @vp_round_nxv4f16( %va, @vp_round_nxv4f16( %va, @vp_round_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -675,11 +675,11 @@ define @vp_round_nxv4f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -699,12 +699,12 @@ define @vp_round_nxv8f16( %va, @vp_round_nxv8f16( %va, @vp_round_nxv8f16( %va, @vp_round_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; 
ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -763,11 +763,11 @@ define @vp_round_nxv8f16_unmasked( %va, i ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -787,12 +787,12 @@ define @vp_round_nxv16f16( %va, @vp_round_nxv16f16( %va, @vp_round_nxv16f16( %va, @vp_round_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -851,11 +851,11 @@ define @vp_round_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -875,12 +875,12 @@ define @vp_round_nxv32f16( %va, @vp_round_nxv32f16( %va, @vp_round_nxv32f16( %va, @vp_round_nxv32f16( %va, @vp_round_nxv32f16( %va, @vp_round_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; ZVFH-NEXT: fsrmi a0, 4 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -995,11 +995,12 @@ define @vp_round_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 +; ZVFHMIN-NEXT: fsrmi a4, 4 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: vmv1r.v v17, v16 @@ -1014,11 +1015,10 @@ define @vp_round_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v17, v8, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a2, 4 ; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: 
fsrm a2 +; ZVFHMIN-NEXT: fsrm a4 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -1033,10 +1033,10 @@ define @vp_round_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 4 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -1064,9 +1064,9 @@ define @vp_round_nxv1f32( %va, @vp_round_nxv1f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1106,9 +1106,9 @@ define @vp_round_nxv2f32( %va, @vp_round_nxv2f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1149,9 +1149,9 @@ define @vp_round_nxv4f32( %va, @vp_round_nxv4f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1193,9 +1193,9 @@ define @vp_round_nxv8f32( %va, @vp_round_nxv8f32_unmasked( %va, ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1237,9 +1237,9 @@ define @vp_round_nxv16f32( %va, @vp_round_nxv16f32_unmasked( % ; CHECK-NEXT: vfabs.v v16, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1276,13 +1276,13 @@ declare @llvm.vp.round.nxv1f64(, @vp_round_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a0) +; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -1297,12 +1297,12 @@ define @vp_round_nxv1f64( %va, @vp_round_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: 
vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1320,12 +1320,12 @@ define @vp_round_nxv2f64( %va, @vp_round_nxv2f64( %va, @vp_round_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1364,12 +1364,12 @@ define @vp_round_nxv4f64( %va, @vp_round_nxv4f64( %va, @vp_round_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1408,12 +1408,12 @@ define @vp_round_nxv7f64( %va, @vp_round_nxv7f64( %va, @vp_round_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1452,12 +1452,12 @@ define @vp_round_nxv8f64( %va, @vp_round_nxv8f64( %va, @vp_round_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1498,59 +1498,66 @@ define @vp_round_nxv16f64( %va, @vp_round_nxv16f64_unmasked( ; CHECK-NEXT: sltu a2, a0, a3 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: fsrmi a3, 4 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16 ; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 4 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: fsrm a3 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t @@ -1585,8 +1592,8 @@ define @vp_round_nxv16f64_unmasked( ; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf 
v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 4 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll index df5844277c997..d4043fd8b6816 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll @@ -22,12 +22,12 @@ define @vp_roundeven_nxv1bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v11, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -49,11 +49,11 @@ define @vp_roundeven_nxv1bf16_unmasked( @vp_roundeven_nxv2bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v11, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -103,11 +103,11 @@ define @vp_roundeven_nxv2bf16_unmasked( @vp_roundeven_nxv4bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -157,11 +157,11 @@ define @vp_roundeven_nxv4bf16_unmasked( @vp_roundeven_nxv8bf16( %va, < ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -211,11 +211,11 @@ define @vp_roundeven_nxv8bf16_unmasked( @vp_roundeven_nxv16bf16( %va ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t @@ -265,11 +265,11 @@ define @vp_roundeven_nxv16bf16_unmasked( @vp_roundeven_nxv32bf16( %va ; CHECK-NEXT: 
vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: lui a3, 307200 +; CHECK-NEXT: fsrmi a4, 0 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 @@ -315,11 +316,10 @@ define @vp_roundeven_nxv32bf16( %va ; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 0 ; CHECK-NEXT: vmv1r.v v0, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: fsrm a4 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -336,11 +336,11 @@ define @vp_roundeven_nxv32bf16( %va ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t @@ -375,11 +375,12 @@ define @vp_roundeven_nxv32bf16_unmasked( @vp_roundeven_nxv32bf16_unmasked( @vp_roundeven_nxv32bf16_unmasked( @llvm.vp.roundeven.nxv1f16(, @vp_roundeven_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a0) +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -461,12 +461,12 @@ define @vp_roundeven_nxv1f16( %va, @vp_roundeven_nxv1f16( %va, @vp_roundeven_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -503,11 +503,11 @@ define @vp_roundeven_nxv1f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -525,13 +525,13 @@ declare @llvm.vp.roundeven.nxv2f16(, @vp_roundeven_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, 
%lo(.LCPI14_0)(a0) +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -547,12 +547,12 @@ define @vp_roundeven_nxv2f16( %va, @vp_roundeven_nxv2f16( %va, @vp_roundeven_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI15_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -589,11 +589,11 @@ define @vp_roundeven_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -611,13 +611,13 @@ declare @llvm.vp.roundeven.nxv4f16(, @vp_roundeven_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a0) +; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -633,12 +633,12 @@ define @vp_roundeven_nxv4f16( %va, @vp_roundeven_nxv4f16( %va, @vp_roundeven_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -675,11 +675,11 @@ define @vp_roundeven_nxv4f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -699,12 +699,12 @@ define @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, 
%lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -763,11 +763,11 @@ define @vp_roundeven_nxv8f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -787,12 +787,12 @@ define @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -851,11 +851,11 @@ define @vp_roundeven_nxv16f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v16 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -875,12 +875,12 @@ define @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; ZVFH-NEXT: fsrmi a0, 0 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -995,11 +995,12 @@ define @vp_roundeven_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; ZVFHMIN-NEXT: vmset.m v16 ; ZVFHMIN-NEXT: lui a3, 307200 +; ZVFHMIN-NEXT: fsrmi a4, 0 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 ; ZVFHMIN-NEXT: sub a3, a0, a1 -; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v16, v16, a2 ; ZVFHMIN-NEXT: sltu a2, a0, a3 ; ZVFHMIN-NEXT: vmv1r.v v17, v16 @@ -1014,11 +1015,10 @@ define @vp_roundeven_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v17, v8, 
fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a2, 0 ; ZVFHMIN-NEXT: vmv1r.v v0, v17 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: fsrm a2 +; ZVFHMIN-NEXT: fsrm a4 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -1033,10 +1033,10 @@ define @vp_roundeven_nxv32f16_unmasked( ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16 ; ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t @@ -1064,9 +1064,9 @@ define @vp_roundeven_nxv1f32( %va, @vp_roundeven_nxv1f32_unmasked( ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1106,9 +1106,9 @@ define @vp_roundeven_nxv2f32( %va, @vp_roundeven_nxv2f32_unmasked( ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1149,9 +1149,9 @@ define @vp_roundeven_nxv4f32( %va, @vp_roundeven_nxv4f32_unmasked( ; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1193,9 +1193,9 @@ define @vp_roundeven_nxv8f32( %va, @vp_roundeven_nxv8f32_unmasked( ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1237,9 +1237,9 @@ define @vp_roundeven_nxv16f32( %va, < ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -1259,8 +1259,8 @@ define @vp_roundeven_nxv16f32_unmasked( @llvm.vp.roundeven.nxv1f64(, define @vp_roundeven_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a0) +; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -1297,12 +1297,12 @@ define @vp_roundeven_nxv1f64( 
%va, @vp_roundeven_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1320,12 +1320,12 @@ define @vp_roundeven_nxv2f64( %va, @vp_roundeven_nxv2f64( %va, @vp_roundeven_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1364,12 +1364,12 @@ define @vp_roundeven_nxv4f64( %va, @vp_roundeven_nxv4f64( %va, @vp_roundeven_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1408,12 +1408,12 @@ define @vp_roundeven_nxv7f64( %va, @vp_roundeven_nxv7f64( %va, @vp_roundeven_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1452,12 +1452,12 @@ define @vp_roundeven_nxv8f64( %va, @vp_roundeven_nxv8f64( %va, @vp_roundeven_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: fsrmi a0, 0 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1498,59 +1498,66 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; 
CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI44_0) ; CHECK-NEXT: srli a3, a1, 3 ; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 +; CHECK-NEXT: vslidedown.vx v25, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: fsrmi a3, 0 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 0 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a3 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -1570,12 +1577,12 @@ define @vp_roundeven_nxv16f64_unmasked( @vp_roundeven_nxv16f64_unmasked( @vp_roundtozero_nxv1bf16( %va, ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfabs.v v11, v10, v0.t -; CHECK-NEXT: 
fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -49,11 +49,11 @@ define @vp_roundtozero_nxv1bf16_unmasked( @vp_roundtozero_nxv2bf16( %va, ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfabs.v v11, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv.v.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t @@ -103,11 +103,11 @@ define @vp_roundtozero_nxv2bf16_unmasked( @vp_roundtozero_nxv4bf16( %va, ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfabs.v v12, v10, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t @@ -157,11 +157,11 @@ define @vp_roundtozero_nxv4bf16_unmasked( @vp_roundtozero_nxv8bf16( %va, ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfabs.v v16, v12, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t @@ -211,11 +211,11 @@ define @vp_roundtozero_nxv8bf16_unmasked( @vp_roundtozero_nxv16bf16( % ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: fmv.w.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t @@ -265,11 +265,11 @@ define @vp_roundtozero_nxv16bf16_unmasked( @vp_roundtozero_nxv32bf16( % ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: lui a3, 307200 +; CHECK-NEXT: fsrmi a4, 1 ; CHECK-NEXT: slli a1, a2, 1 ; CHECK-NEXT: srli a2, a2, 2 ; CHECK-NEXT: fmv.w.x fa5, a3 @@ -315,11 +316,10 @@ define @vp_roundtozero_nxv32bf16( % ; CHECK-NEXT: vfabs.v v8, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 1 ; CHECK-NEXT: vmv1r.v v0, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t -; CHECK-NEXT: fsrm a2 +; CHECK-NEXT: fsrm a4 ; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -336,11 +336,11 @@ define 
@vp_roundtozero_nxv32bf16( % ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t @@ -375,11 +375,12 @@ define @vp_roundtozero_nxv32bf16_unmasked( @vp_roundtozero_nxv32bf16_unmasked( @vp_roundtozero_nxv32bf16_unmasked( @llvm.vp.roundtozero.nxv1f16(, @vp_roundtozero_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a0) +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -461,12 +461,12 @@ define @vp_roundtozero_nxv1f16( %va, @vp_roundtozero_nxv1f16( %va, @vp_roundtozero_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -503,11 +503,11 @@ define @vp_roundtozero_nxv1f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -525,13 +525,13 @@ declare @llvm.vp.roundtozero.nxv2f16(, @vp_roundtozero_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a0) +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -547,12 +547,12 @@ define @vp_roundtozero_nxv2f16( %va, @vp_roundtozero_nxv2f16( %va, @vp_roundtozero_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI15_0) +; 
ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -589,11 +589,11 @@ define @vp_roundtozero_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v9 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -611,13 +611,13 @@ declare @llvm.vp.roundtozero.nxv4f16(, @vp_roundtozero_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: lui a0, %hi(.LCPI16_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a0) +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 @@ -633,12 +633,12 @@ define @vp_roundtozero_nxv4f16( %va, @vp_roundtozero_nxv4f16( %va, @vp_roundtozero_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 -; ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -675,11 +675,11 @@ define @vp_roundtozero_nxv4f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v10 -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -699,12 +699,12 @@ define @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 -; ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -763,11 +763,11 @@ define @vp_roundtozero_nxv8f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: lui a0, 307200 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vfabs.v v8, v12 
-; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vmflt.vf v0, v8, fa5 -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: fsrm a0 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t @@ -787,12 +787,12 @@ define @vp_roundtozero_nxv16f16( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vmv1r.v v12, v0 +; ZVFH-NEXT: vfabs.v v16, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI20_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a0) -; ZVFH-NEXT: vfabs.v v16, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -809,12 +809,12 @@ define @vp_roundtozero_nxv16f16( %va, < ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: lui a0, 307200 ; ZVFHMIN-NEXT: vmv1r.v v8, v0 +; ZVFHMIN-NEXT: fmv.w.x fa5, a0 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t -; ZVFHMIN-NEXT: fmv.w.x fa5, a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v24, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t @@ -833,12 +833,12 @@ define @vp_roundtozero_nxv16f16( %va, < define @vp_roundtozero_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 -; ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI21_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -851,11 +851,11 @@ define @vp_roundtozero_nxv16f16_unmasked( @vp_roundtozero_nxv32f16( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vmv1r.v v16, v0 +; ZVFH-NEXT: vfabs.v v24, v8, v0.t ; ZVFH-NEXT: lui a0, %hi(.LCPI22_0) ; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a0) -; ZVFH-NEXT: vfabs.v v24, v8, v0.t +; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t -; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vmv1r.v v0, v16 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -902,6 +902,7 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFHMIN-NEXT: vmv1r.v v7, v0 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: lui a3, 307200 +; ZVFHMIN-NEXT: fsrmi a4, 1 ; ZVFHMIN-NEXT: slli a1, a2, 1 ; ZVFHMIN-NEXT: srli a2, a2, 2 ; ZVFHMIN-NEXT: fmv.w.x fa5, a3 @@ -920,11 +921,10 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFHMIN-NEXT: vfabs.v v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v18, v8, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a2, 1 ; ZVFHMIN-NEXT: vmv1r.v v0, v18 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v24, v0.t -; ZVFHMIN-NEXT: fsrm a2 +; ZVFHMIN-NEXT: fsrm a4 ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v24, v8, v24, v0.t @@ -941,11 +941,11 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v 
v24, v16, v0.t ; ZVFHMIN-NEXT: vmv1r.v v8, v7 +; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfabs.v v16, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vmflt.vf v8, v16, fa5, v0.t -; ZVFHMIN-NEXT: fsrmi a0, 1 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfcvt.x.f.v v16, v24, v0.t @@ -970,12 +970,12 @@ define @vp_roundtozero_nxv32f16( %va, < define @vp_roundtozero_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 -; ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; ZVFH-NEXT: lui a0, %hi(.LCPI23_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; ZVFH-NEXT: fsrmi a0, 1 +; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: fsrm a0 ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -995,11 +995,12 @@ define @vp_roundtozero_nxv32f16_unmasked( @vp_roundtozero_nxv32f16_unmasked( @vp_roundtozero_nxv32f16_unmasked( @vp_roundtozero_nxv1f32( %va, @vp_roundtozero_nxv1f32_unmasked( @vp_roundtozero_nxv2f32( %va, @vp_roundtozero_nxv2f32_unmasked( @vp_roundtozero_nxv4f32( %va, @vp_roundtozero_nxv4f32_unmasked( @vp_roundtozero_nxv8f32( %va, @vp_roundtozero_nxv8f32_unmasked( @vp_roundtozero_nxv16f32( %va, ; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -1259,8 +1259,8 @@ define @vp_roundtozero_nxv16f32_unmasked( @llvm.vp.roundtozero.nxv1f64( define @vp_roundtozero_nxv1f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: lui a0, %hi(.LCPI34_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a0) +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 @@ -1297,12 +1297,12 @@ define @vp_roundtozero_nxv1f64( %va, define @vp_roundtozero_nxv1f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv1f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI35_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1320,12 +1320,12 @@ define @vp_roundtozero_nxv2f64( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vfabs.v v12, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI36_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu 
; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -1341,12 +1341,12 @@ define @vp_roundtozero_nxv2f64( %va, define @vp_roundtozero_nxv2f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI37_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -1364,12 +1364,12 @@ define @vp_roundtozero_nxv4f64( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI38_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1385,12 +1385,12 @@ define @vp_roundtozero_nxv4f64( %va, define @vp_roundtozero_nxv4f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI39_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1408,12 +1408,12 @@ define @vp_roundtozero_nxv7f64( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI40_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -1429,12 +1429,12 @@ define @vp_roundtozero_nxv7f64( %va, define @vp_roundtozero_nxv7f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv7f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI41_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1452,12 +1452,12 @@ define @vp_roundtozero_nxv8f64( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vfabs.v v24, v8, v0.t ; CHECK-NEXT: lui a0, %hi(.LCPI42_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, 
v8, v0.t +; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t @@ -1473,12 +1473,12 @@ define @vp_roundtozero_nxv8f64( %va, define @vp_roundtozero_nxv8f64_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_nxv8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: lui a0, %hi(.LCPI43_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1498,59 +1498,66 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI44_0) ; CHECK-NEXT: srli a3, a1, 3 ; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) ; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 +; CHECK-NEXT: vslidedown.vx v25, v0, a3 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: fsrmi a3, 1 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 1 -; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: fsrm a3 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, 
ta, mu -; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -1570,12 +1577,12 @@ define @vp_roundtozero_nxv16f64_unmasked( @vp_roundtozero_nxv16f64_unmasked( @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %w, %x, %y, %z) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: -; CHECK-NEXT: ld a0, 0(sp) -; CHECK-NEXT: ld a1, 8(sp) +; CHECK-NEXT: ld a0, 8(sp) ; CHECK-NEXT: vl8re32.v v24, (a0) -; CHECK-NEXT: vl8re32.v v0, (a1) +; CHECK-NEXT: ld a0, 0(sp) +; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v24 -; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %s0 = add %w, %y diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index 936fa21763eba..a050034c63168 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -58,15 +58,13 @@ body: | ; CHECK-NEXT: %true:vr = COPY $v9 ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */ - ; CHECK-NEXT: $v0 = COPY %mask ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, %avl, 5 /* e32 */, 0 /* tu, mu */ %false:vr = COPY $v8 %pt:vrnov0 = COPY $v8 %true:vr = COPY $v9 %avl:gprnox0 = COPY $x1 %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 - $v0 = COPY %mask - %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, %avl, 5 + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl, 5 ... --- name: same_mask @@ -78,18 +76,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %pt:vr = COPY $v8 ; CHECK-NEXT: %false:vrnov0 = COPY $v9 - ; CHECK-NEXT: %mask:vr = COPY $v0 - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ - ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */ %pt:vrnov0 = COPY $v8 %false:vrnov0 = COPY $v9 - %mask:vr = COPY $v0 - $v0 = COPY %mask - %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ - $v0 = COPY %mask - %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %mask:vmv0 = COPY $v0 + %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */ ... 
--- # Shouldn't be converted because false operands are different @@ -102,18 +96,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 ; CHECK-NEXT: %false:vrnov0 = COPY $v9 - ; CHECK-NEXT: %mask:vr = COPY $v0 - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */ %pt:vrnov0 = COPY $v8 %false:vrnov0 = COPY $v9 - %mask:vr = COPY $v0 - $v0 = COPY %mask - %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ - $v0 = COPY %mask - %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %mask:vmv0 = COPY $v0 + %true:vrnov0 = PseudoVADD_VV_M1_MASK %pt, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */ ... --- # Shouldn't be converted because EEWs are different @@ -126,18 +116,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %pt:vrnov0 = COPY $v8 ; CHECK-NEXT: %false:vrnov0 = COPY $v9 - ; CHECK-NEXT: %mask:vr = COPY $v0 - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 4 /* e16 */, 0 /* tu, mu */ - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */ %pt:vrnov0 = COPY $v8 %false:vrnov0 = COPY $v9 - %mask:vr = COPY $v0 - $v0 = COPY %mask - %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 4 /* e16 */, 0 /* tu, mu */ - $v0 = COPY %mask - %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, $v0, 8, 5 /* e32 */ + %mask:vmv0 = COPY $v0 + %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 4 /* e16 */, 0 /* tu, mu */ + %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */ ... --- name: same_mask_undef_truepassthru @@ -148,16 +134,12 @@ body: | ; CHECK: liveins: $v8, $v0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %false:vrnov0 = COPY $v8 - ; CHECK-NEXT: %mask:vr = COPY $v0 - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, $v0, 4, 5 /* e32 */, 1 /* ta, mu */ - ; CHECK-NEXT: $v0 = COPY %mask + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */ %false:vr = COPY $v8 - %mask:vr = COPY $v0 - $v0 = COPY %mask - %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ - $v0 = COPY %mask - %x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, $v0, 4, 5 /* e32 */ + %mask:vmv0 = COPY $v0 + %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ + %x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */ ... 
--- # Shouldn't be converted because true is in a different block @@ -169,19 +151,15 @@ body: | ; CHECK-NEXT: liveins: $v8, $v0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %false:vr = COPY $v8 - ; CHECK-NEXT: %mask:vr = COPY $v0 - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %mask:vmv0 = COPY $v0 + ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: $v0 = COPY %mask - ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, $v0, 4, 5 /* e32 */ + ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */ bb.0: liveins: $v8, $v0 %false:vr = COPY $v8 - %mask:vr = COPY $v0 - $v0 = COPY %mask - %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, $v0, 4, 5 /* e32 */, 0 /* tu, mu */ + %mask:vmv0 = COPY $v0 + %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ bb.1: - $v0 = COPY %mask - %5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, $v0, 4, 5 /* e32 */ + %5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */ diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll index c01cbf49483b7..0c058b562f53d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll @@ -15,8 +15,8 @@ define void @vpmerge_vpload_store( %passthru, ptr %p, ) into %ir.p) ; CHECK-NEXT: PseudoRET %a = call @llvm.vp.load.nxv2i32.p0(ptr %p, splat (i1 -1), i32 %vl) @@ -34,8 +34,8 @@ define void @vpselect_vpload_store( %passthru, ptr %p, ) into %ir.p) ; CHECK-NEXT: PseudoRET %a = call @llvm.vp.load.nxv2i32.p0(ptr %p, splat (i1 -1), i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index 403cc0eb9dce1..f6417fa29ea0f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -969,8 +969,8 @@ define @vfredusum( %passthru, @llvm.riscv.vfredusum.nxv2f32.nxv2f32( %passthru, diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index e6272701a6033..9a4121b41c3f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -1358,13 +1358,10 @@ define @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @llvm.vp.fcmp.nxv64f16(, @fcmp_oeq_vv_nxv64f16( %va, %vb, %m, i32 zeroext %evl) { ; ZVFH-LABEL: fcmp_oeq_vv_nxv64f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: addi sp, sp, -16 -; ZVFH-NEXT: .cfi_def_cfa_offset 16 -; ZVFH-NEXT: csrr a1, vlenb -; ZVFH-NEXT: slli a1, a1, 4 -; ZVFH-NEXT: sub sp, sp, a1 -; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFH-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; ZVFH-NEXT: vmv1r.v v24, v0 +; ZVFH-NEXT: vmv1r.v v7, v0 ; ZVFH-NEXT: csrr a1, vlenb -; ZVFH-NEXT: slli a1, a1, 3 -; ZVFH-NEXT: add a1, sp, a1 -; ZVFH-NEXT: addi a1, a1, 16 -; ZVFH-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; ZVFH-NEXT: csrr a3, vlenb -; ZVFH-NEXT: srli a1, a3, 1 -; ZVFH-NEXT: slli a4, a3, 3 -; ZVFH-NEXT: 
slli a3, a3, 2 +; ZVFH-NEXT: slli a4, a1, 3 +; ZVFH-NEXT: slli a3, a1, 2 ; ZVFH-NEXT: add a4, a0, a4 ; ZVFH-NEXT: sub a5, a2, a3 -; ZVFH-NEXT: vl8re16.v v8, (a4) +; ZVFH-NEXT: vl8re16.v v24, (a4) ; ZVFH-NEXT: sltu a4, a2, a5 ; ZVFH-NEXT: addi a4, a4, -1 -; ZVFH-NEXT: vl8re16.v v0, (a0) -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; ZVFH-NEXT: vslidedown.vx v0, v24, a1 ; ZVFH-NEXT: and a4, a4, a5 +; ZVFH-NEXT: srli a1, a1, 1 +; ZVFH-NEXT: vslidedown.vx v0, v0, a1 ; ZVFH-NEXT: vsetvli zero, a4, e16, m8, ta, ma -; ZVFH-NEXT: vmfeq.vv v7, v16, v8, v0.t +; ZVFH-NEXT: vmfeq.vv v6, v16, v24, v0.t +; ZVFH-NEXT: vl8re16.v v24, (a0) ; ZVFH-NEXT: bltu a2, a3, .LBB171_2 ; ZVFH-NEXT: # %bb.1: ; ZVFH-NEXT: mv a2, a3 ; ZVFH-NEXT: .LBB171_2: -; ZVFH-NEXT: vmv1r.v v0, v24 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; ZVFH-NEXT: addi a0, sp, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vmv1r.v v0, v7 ; ZVFH-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; ZVFH-NEXT: vmfeq.vv v16, v8, v24, v0.t ; ZVFH-NEXT: add a0, a1, a1 ; ZVFH-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; ZVFH-NEXT: vslideup.vx v16, v7, a1 +; ZVFH-NEXT: vslideup.vx v16, v6, a1 ; ZVFH-NEXT: vmv.v.v v0, v16 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add sp, sp, a0 -; ZVFH-NEXT: .cfi_def_cfa sp, 16 -; ZVFH-NEXT: addi sp, sp, 16 -; ZVFH-NEXT: .cfi_def_cfa_offset 0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv64f16: @@ -3558,13 +3522,10 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @llvm.vp.icmp.nxv128i8(, @icmp_eq_vv_nxv128i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv128i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: sub a4, a3, a1 -; CHECK-NEXT: vl8r.v v8, (a2) +; CHECK-NEXT: vl8r.v v24, (a2) ; CHECK-NEXT: sltu a2, a3, a4 -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a4 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmseq.vv v6, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t +; CHECK-NEXT: vl8r.v v24, (a0) ; CHECK-NEXT: bltu a3, a1, .LBB96_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a1 ; CHECK-NEXT: .LBB96_2: ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma ; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: 
vmv1r.v v8, v6 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv128i8( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v @@ -2223,59 +2197,33 @@ declare @llvm.vp.icmp.nxv32i32(, @icmp_eq_vv_nxv32i32( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vv_nxv32i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: srli a1, a3, 2 -; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: slli a4, a1, 3 +; CHECK-NEXT: slli a3, a1, 1 ; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: sub a5, a2, a3 -; CHECK-NEXT: vl8re32.v v8, (a4) +; CHECK-NEXT: vl8re32.v v24, (a4) ; CHECK-NEXT: sltu a4, a2, a5 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: vl8re32.v v0, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a1 ; CHECK-NEXT: and a4, a4, a5 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vslidedown.vx v0, v0, a1 ; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; CHECK-NEXT: vmseq.vv v7, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v6, v16, v24, v0.t +; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: bltu a2, a3, .LBB189_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB189_2: -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: add a0, a1, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v16, v7, a1 +; CHECK-NEXT: vslideup.vx v16, v6, a1 ; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.icmp.nxv32i32( %va, %vb, metadata !"eq", %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll b/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll index d12f2c889650f..eb6635117d0a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll @@ -17,8 +17,8 @@ define void @vecaddr_straightline(i32 zeroext %a, ptr %p) { ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vle32.v v8, (s0) ; RV32-NEXT: vadd.vi v8, v8, 1 -; RV32-NEXT: li a1, 57 ; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: li a1, 57 ; RV32-NEXT: beq a0, a1, .LBB0_2 ; RV32-NEXT: # %bb.1: # %do_call ; RV32-NEXT: call foo @@ -47,8 +47,8 @@ define void @vecaddr_straightline(i32 zeroext %a, ptr %p) { ; RV64-NEXT: 
vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (s0) ; RV64-NEXT: vadd.vi v8, v8, 1 -; RV64-NEXT: li a1, 57 ; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: li a1, 57 ; RV64-NEXT: beq a0, a1, .LBB0_2 ; RV64-NEXT: # %bb.1: # %do_call ; RV64-NEXT: call foo @@ -97,8 +97,8 @@ define void @vecaddr_loop(i32 zeroext %a, ptr %p) { ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vle32.v v8, (s0) ; RV32-NEXT: vadd.vi v8, v8, 1 -; RV32-NEXT: li a1, 57 ; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: li a1, 57 ; RV32-NEXT: beq a0, a1, .LBB1_2 ; RV32-NEXT: .LBB1_1: # %do_call ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 @@ -129,8 +129,8 @@ define void @vecaddr_loop(i32 zeroext %a, ptr %p) { ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (s0) ; RV64-NEXT: vadd.vi v8, v8, 1 -; RV64-NEXT: li a1, 57 ; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: li a1, 57 ; RV64-NEXT: beq a0, a1, .LBB1_2 ; RV64-NEXT: .LBB1_1: # %do_call ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 1948675ae9cf0..c0792566160ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -1457,19 +1457,19 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fmul_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB26_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: bgeu a1, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: j .LBB26_5 ; CHECK-NEXT: .LBB26_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: andi a4, a1, 1024 +; CHECK-NEXT: xori a1, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB26_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1477,12 +1477,12 @@ define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfmul.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: bnez a6, .LBB26_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB26_7 ; CHECK-NEXT: .LBB26_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a2, 2 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 @@ -1547,19 +1547,19 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB27_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: bgeu a1, a3, .LBB27_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: j .LBB27_5 ; CHECK-NEXT: .LBB27_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: andi a4, a1, 1024 +; 
CHECK-NEXT: xori a1, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1567,12 +1567,12 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: bnez a6, .LBB27_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB27_7 ; CHECK-NEXT: .LBB27_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a2, 2 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 @@ -1637,19 +1637,19 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB28_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: bgeu a1, a3, .LBB28_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: j .LBB28_5 ; CHECK-NEXT: .LBB28_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: andi a4, a1, 1024 +; CHECK-NEXT: xori a1, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1657,12 +1657,12 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: bnez a6, .LBB28_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB28_7 ; CHECK-NEXT: .LBB28_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a2, 2 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 @@ -1727,19 +1727,19 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fadd_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB29_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: bgeu a1, a3, .LBB29_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: j .LBB29_5 ; CHECK-NEXT: .LBB29_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: andi a4, a1, 1024 +; CHECK-NEXT: xori a1, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB29_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1747,12 +1747,12 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfadd.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: 
bnez a6, .LBB29_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB29_7 ; CHECK-NEXT: .LBB29_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a2, 2 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 @@ -1817,19 +1817,19 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB30_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: bgeu a1, a3, .LBB30_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: j .LBB30_5 ; CHECK-NEXT: .LBB30_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: andi a4, a1, 1024 +; CHECK-NEXT: xori a1, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB30_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1837,12 +1837,12 @@ define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: bnez a6, .LBB30_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB30_7 ; CHECK-NEXT: .LBB30_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a2, 2 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 @@ -1907,19 +1907,19 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB31_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: bgeu a1, a3, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: j .LBB31_5 ; CHECK-NEXT: .LBB31_2: # %vector.ph -; CHECK-NEXT: addi a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 +; CHECK-NEXT: addi a1, a3, -1 +; CHECK-NEXT: andi a4, a1, 1024 +; CHECK-NEXT: xori a1, a4, 1024 ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a2 +; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB31_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1927,12 +1927,12 @@ define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfrsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: bnez a6, .LBB31_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB31_7 ; CHECK-NEXT: .LBB31_5: # %for.body.preheader -; CHECK-NEXT: slli a1, a2, 2 +; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: add a0, a0, a2 @@ -2073,35 +2073,35 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) { ; CHECK-LABEL: sink_splat_fma_scalable: ; CHECK: # 
%bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a4, a2, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a4, .LBB34_2 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: srli a4, a3, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a4, .LBB34_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB34_5 ; CHECK-NEXT: .LBB34_2: # %vector.ph -; CHECK-NEXT: addi a3, a4, -1 -; CHECK-NEXT: andi a5, a3, 1024 -; CHECK-NEXT: xori a3, a5, 1024 +; CHECK-NEXT: addi a2, a4, -1 +; CHECK-NEXT: andi a5, a2, 1024 +; CHECK-NEXT: xori a2, a5, 1024 ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a3 +; CHECK-NEXT: mv t0, a2 ; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB34_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) ; CHECK-NEXT: sub t0, t0, a4 -; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: add a7, a7, a3 ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: add a6, a6, a2 +; CHECK-NEXT: add a6, a6, a3 ; CHECK-NEXT: bnez t0, .LBB34_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB34_7 ; CHECK-NEXT: .LBB34_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a2, a1, a2 @@ -2173,35 +2173,35 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) { ; CHECK-LABEL: sink_splat_fma_commute_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a4, a2, 2 -; CHECK-NEXT: li a3, 1024 -; CHECK-NEXT: bgeu a3, a4, .LBB35_2 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: srli a4, a3, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a4, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB35_5 ; CHECK-NEXT: .LBB35_2: # %vector.ph -; CHECK-NEXT: addi a3, a4, -1 -; CHECK-NEXT: andi a5, a3, 1024 -; CHECK-NEXT: xori a3, a5, 1024 +; CHECK-NEXT: addi a2, a4, -1 +; CHECK-NEXT: andi a5, a2, 1024 +; CHECK-NEXT: xori a2, a5, 1024 ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a3 +; CHECK-NEXT: mv t0, a2 ; CHECK-NEXT: vsetvli t1, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB35_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) ; CHECK-NEXT: sub t0, t0, a4 -; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: add a7, a7, a3 ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: add a6, a6, a2 +; CHECK-NEXT: add a6, a6, a3 ; CHECK-NEXT: bnez t0, .LBB35_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB35_7 ; CHECK-NEXT: .LBB35_5: # %for.body.preheader -; CHECK-NEXT: slli a2, a3, 2 +; CHECK-NEXT: slli a2, a2, 2 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a2, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll index 62339130678d0..86cf1ee04b60a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll @@ -561,14 +561,14 @@ define @add_stepvector_nxv16i64() { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vid.v v8 ; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: vadd.vv v8, 
v8, v8 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vadd.vv v16, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -596,16 +596,16 @@ define @mul_stepvector_nxv16i64() { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vid.v v8 +; RV32-NEXT: li a1, 3 +; RV32-NEXT: vmul.vx v8, v8, a1 ; RV32-NEXT: slli a1, a0, 1 ; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vid.v v8 -; RV32-NEXT: li a0, 3 -; RV32-NEXT: vmul.vx v8, v8, a0 ; RV32-NEXT: vadd.vv v16, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -649,16 +649,16 @@ define @mul_bigimm_stepvector_nxv16i64() { ; RV32-NEXT: slli a3, a0, 1 ; RV32-NEXT: slli a0, a0, 6 ; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: addi a3, sp, 8 +; RV32-NEXT: vsetvli a4, zero, e64, m8, ta, ma +; RV32-NEXT: vid.v v8 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: mv a1, sp ; RV32-NEXT: sw a2, 0(sp) ; RV32-NEXT: sw a0, 4(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: mv a0, sp -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vid.v v24 -; RV32-NEXT: vmul.vv v8, v24, v8 +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vadd.vv v16, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -689,14 +689,14 @@ define @shl_stepvector_nxv16i64() { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vid.v v8 ; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vadd.vv v16, v8, v16 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll index a8934bb25571c..081afcfab8dae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll @@ -16,8 +16,8 @@ define @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride, ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: $v0 = COPY [[COPY1]] - ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]] + ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1) ; CHECK-NEXT: $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]] ; CHECK-NEXT: PseudoRET implicit $v8 %load = call 
@llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, %m, i32 %evl) @@ -36,8 +36,8 @@ define void @strided_vpstore_nxv1i8_i8( %val, ptr %ptr, i8 sign ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vr = COPY $v8 - ; CHECK-NEXT: $v0 = COPY [[COPY1]] - ; CHECK-NEXT: PseudoVSSE8_V_MF8_MASK [[COPY4]], [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */ :: (store unknown-size, align 1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vmv0 = COPY [[COPY1]] + ; CHECK-NEXT: PseudoVSSE8_V_MF8_MASK [[COPY4]], [[COPY3]], [[COPY2]], [[COPY5]], [[COPY]], 3 /* e8 */ :: (store unknown-size, align 1) ; CHECK-NEXT: PseudoRET call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8( %val, ptr %ptr, i8 %stride, %m, i32 %evl) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index ecd098edb30ae..881a8795cc504 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -676,9 +676,9 @@ define @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, %v, ptr %ptr, i32 sig ; CHECK-NEXT: mul a4, a4, a1 ; CHECK-NEXT: srli a3, a3, 3 ; CHECK-NEXT: sltu a2, a2, a5 -; CHECK-NEXT: vsetvli a6, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v0, a3 +; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a2, a2, a5 -; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a3 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v16, (a0), a1, v0.t ; CHECK-NEXT: ret @@ -646,21 +646,21 @@ define void @strided_store_nxv17f64( %v, ptr %ptr, i32 sig ; CHECK-NEXT: sltu a3, a3, a6 ; CHECK-NEXT: addi t0, t0, -1 ; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and t0, t0, a0 -; CHECK-NEXT: and a0, a3, a6 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload -; CHECK-NEXT: vsetvli zero, t0, e64, m8, ta, ma +; CHECK-NEXT: and a0, t0, a0 +; CHECK-NEXT: addi t0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (t0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v8, (a7), a2, v0.t +; CHECK-NEXT: and a0, a3, a6 ; CHECK-NEXT: bltu a0, a4, .LBB48_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: .LBB48_6: ; CHECK-NEXT: mul a3, a5, a2 ; CHECK-NEXT: srli a4, a4, 2 -; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v24, a4 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v16, (a1), a2, v0.t ; CHECK-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll index 68e0c0089d0c7..a5dd27149c1f2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/umulo-sdnode.ll @@ -7,10 +7,10 @@ define @umulo_nxv1i8( %x, % ; CHECK-LABEL: umulo_nxv1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmulhu.vv v10, v8, v9 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmul.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vmul.vv v10, v8, v9 +; 
CHECK-NEXT: vmulhu.vv v8, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %a = call { , } @llvm.umul.with.overflow.nxv1i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -25,10 +25,10 @@ define @umulo_nxv2i8( %x, % ; CHECK-LABEL: umulo_nxv2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmulhu.vv v10, v8, v9 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmul.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vmul.vv v10, v8, v9 +; CHECK-NEXT: vmulhu.vv v8, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %a = call { , } @llvm.umul.with.overflow.nxv2i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -43,10 +43,10 @@ define @umulo_nxv4i8( %x, % ; CHECK-LABEL: umulo_nxv4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmulhu.vv v10, v8, v9 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmul.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vmul.vv v10, v8, v9 +; CHECK-NEXT: vmulhu.vv v8, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %a = call { , } @llvm.umul.with.overflow.nxv4i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -61,10 +61,10 @@ define @umulo_nxv8i8( %x, % ; CHECK-LABEL: umulo_nxv8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; CHECK-NEXT: vmulhu.vv v10, v8, v9 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmul.vv v8, v8, v9 -; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: vmul.vv v10, v8, v9 +; CHECK-NEXT: vmulhu.vv v8, v8, v9 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 0, v0 ; CHECK-NEXT: ret %a = call { , } @llvm.umul.with.overflow.nxv8i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -79,10 +79,10 @@ define @umulo_nxv16i8( %x, , } @llvm.umul.with.overflow.nxv16i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -97,10 +97,10 @@ define @umulo_nxv32i8( %x, , } @llvm.umul.with.overflow.nxv32i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -115,10 +115,10 @@ define @umulo_nxv64i8( %x, , } @llvm.umul.with.overflow.nxv64i8( %x, %y) %b = extractvalue { , } %a, 0 @@ -133,10 +133,10 @@ define @umulo_nxv1i16( %x, , } @llvm.umul.with.overflow.nxv1i16( %x, %y) %b = extractvalue { , } %a, 0 @@ -151,10 +151,10 @@ define @umulo_nxv2i16( %x, , } @llvm.umul.with.overflow.nxv2i16( %x, %y) %b = extractvalue { , } %a, 0 @@ -169,10 +169,10 @@ define @umulo_nxv4i16( %x, , } @llvm.umul.with.overflow.nxv4i16( %x, %y) %b = extractvalue { , } %a, 0 @@ -187,10 +187,10 @@ define @umulo_nxv8i16( %x, , } @llvm.umul.with.overflow.nxv8i16( %x, %y) %b = extractvalue { , } %a, 0 @@ -205,10 +205,10 @@ define @umulo_nxv16i16( %x, , } @llvm.umul.with.overflow.nxv16i16( %x, %y) %b = extractvalue { , } %a, 0 @@ -223,10 +223,10 @@ define @umulo_nxv32i16( %x, , } @llvm.umul.with.overflow.nxv32i16( %x, %y) %b = extractvalue { , } %a, 0 @@ -241,10 +241,10 @@ define @umulo_nxv1i32( %x, , } @llvm.umul.with.overflow.nxv1i32( %x, %y) %b = extractvalue { , } %a, 0 @@ -259,10 +259,10 @@ define @umulo_nxv2i32( %x, , } @llvm.umul.with.overflow.nxv2i32( %x, %y) %b = extractvalue { , } %a, 0 @@ -277,10 +277,10 @@ define @umulo_nxv4i32( %x, , } @llvm.umul.with.overflow.nxv4i32( %x, %y) %b = extractvalue { , } %a, 0 @@ -295,10 +295,10 @@ define @umulo_nxv8i32( %x, , } @llvm.umul.with.overflow.nxv8i32( %x, %y) %b = extractvalue { , } %a, 0 @@ -313,10 +313,10 @@ define @umulo_nxv16i32( %x, , } @llvm.umul.with.overflow.nxv16i32( %x, %y) 
%b = extractvalue { , } %a, 0 @@ -331,10 +331,10 @@ define @umulo_nxv1i64( %x, , } @llvm.umul.with.overflow.nxv1i64( %x, %y) %b = extractvalue { , } %a, 0 @@ -349,10 +349,10 @@ define @umulo_nxv2i64( %x, , } @llvm.umul.with.overflow.nxv2i64( %x, %y) %b = extractvalue { , } %a, 0 @@ -367,10 +367,10 @@ define @umulo_nxv4i64( %x, , } @llvm.umul.with.overflow.nxv4i64( %x, %y) %b = extractvalue { , } %a, 0 @@ -385,10 +385,10 @@ define @umulo_nxv8i64( %x, , } @llvm.umul.with.overflow.nxv8i64( %x, %y) %b = extractvalue { , } %a, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll index 0bd82e654e021..2c89e939940b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll @@ -10,13 +10,13 @@ define @test_urem_vec_even_divisor_eq0( %x) ; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; RV32-NEXT: vmul.vx v8, v8, a0 ; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vsll.vi v9, v8, 15 ; RV32-NEXT: vsrl.vi v8, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vmsgtu.vx v0, v8, a0 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vmerge.vim v8, v9, -1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec_even_divisor_eq0: @@ -26,13 +26,13 @@ define @test_urem_vec_even_divisor_eq0( %x) ; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; RV64-NEXT: vmul.vx v8, v8, a0 ; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vsll.vi v9, v8, 15 ; RV64-NEXT: vsrl.vi v8, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vmsgtu.vx v0, v8, a0 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vmerge.vim v8, v9, -1, v0 ; RV64-NEXT: ret %urem = urem %x, splat (i16 6) %cmp = icmp ne %urem, splat (i16 0) @@ -48,10 +48,10 @@ define @test_urem_vec_odd_divisor_eq0( %x) ; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; RV32-NEXT: vmul.vx v8, v8, a0 ; RV32-NEXT: lui a0, 3 +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: addi a0, a0, 819 ; RV32-NEXT: vmsgtu.vx v0, v8, a0 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vmerge.vim v8, v9, -1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec_odd_divisor_eq0: @@ -61,10 +61,10 @@ define @test_urem_vec_odd_divisor_eq0( %x) ; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; RV64-NEXT: vmul.vx v8, v8, a0 ; RV64-NEXT: lui a0, 3 +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: addi a0, a0, 819 ; RV64-NEXT: vmsgtu.vx v0, v8, a0 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vmerge.vim v8, v9, -1, v0 ; RV64-NEXT: ret %urem = urem %x, splat (i16 5) %cmp = icmp ne %urem, splat (i16 0) @@ -82,13 +82,13 @@ define @test_urem_vec_even_divisor_eq1( %x) ; RV32-NEXT: addi a0, a0, -1365 ; RV32-NEXT: vmul.vx v8, v8, a0 ; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vsll.vi v9, v8, 15 ; RV32-NEXT: vsrl.vi v8, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: addi a0, a0, -1366 ; RV32-NEXT: vmsgtu.vx v0, v8, a0 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vmerge.vim v8, v9, -1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec_even_divisor_eq1: @@ -100,13 +100,13 @@ define @test_urem_vec_even_divisor_eq1( %x) ; RV64-NEXT: addi a0, a0, -1365 ; RV64-NEXT: vmul.vx v8, v8, a0 ; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: 
vsll.vi v9, v8, 15 ; RV64-NEXT: vsrl.vi v8, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: addi a0, a0, -1366 ; RV64-NEXT: vmsgtu.vx v0, v8, a0 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vmerge.vim v8, v9, -1, v0 ; RV64-NEXT: ret %urem = urem %x, splat (i16 6) %cmp = icmp ne %urem, splat (i16 1) @@ -124,10 +124,10 @@ define @test_urem_vec_odd_divisor_eq1( %x) ; RV32-NEXT: addi a0, a0, -819 ; RV32-NEXT: vmul.vx v8, v8, a0 ; RV32-NEXT: lui a0, 3 +; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: addi a0, a0, 818 ; RV32-NEXT: vmsgtu.vx v0, v8, a0 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vmerge.vim v8, v9, -1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec_odd_divisor_eq1: @@ -139,10 +139,10 @@ define @test_urem_vec_odd_divisor_eq1( %x) ; RV64-NEXT: addi a0, a0, -819 ; RV64-NEXT: vmul.vx v8, v8, a0 ; RV64-NEXT: lui a0, 3 +; RV64-NEXT: vmv.v.i v9, 0 ; RV64-NEXT: addi a0, a0, 818 ; RV64-NEXT: vmsgtu.vx v0, v8, a0 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vmerge.vim v8, v9, -1, v0 ; RV64-NEXT: ret %urem = urem %x, splat (i16 5) %cmp = icmp ne %urem, splat (i16 1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll index 77f3cf3ca4980..cd1609f90c6b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1442,12 +1442,11 @@ define @vadd_vi_nxv32i32_evl_nx16( %va, < ; RV64-LABEL: vadd_vi_nxv32i32_evl_nx16: ; RV64: # %bb.0: ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV64-NEXT: vadd.vi v8, v8, -1, v0.t ; RV64-NEXT: srli a0, a0, 2 ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v24, v0, a0 -; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vslidedown.vx v0, v0, a0 ; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, ma ; RV64-NEXT: vadd.vi v16, v16, -1, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 9f0b2b3914836..6e9826b2fcdb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -204,19 +204,19 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v14, v8, 8 ; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a2, a1, a1 ; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v11, v10, a1 ; CHECK-NEXT: vslideup.vx v8, v12, a1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a0, a0 +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v11, a0 -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v14 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: vs2r.v v8, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a1) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 @@ -576,19 +576,19 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; 
CHECK-NEXT: vslidedown.vi v14, v8, 8 ; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a2, a1, a1 ; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v11, v10, a1 ; CHECK-NEXT: vslideup.vx v8, v12, a1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a0, a0 +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v11, a0 -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v14 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: vs2r.v v8, (a1) +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a1) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 14f306da21dba..55359e82e9720 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -12,11 +12,12 @@ define {, } @vector_deinterleave_load_nxv16i ; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 @@ -134,44 +135,62 @@ define {, } @vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a2, a1, 4 +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb ; CHECK-NEXT: li a1, 85 ; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: li a1, 170 ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: vmv.v.x v17, a1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vcompress.vm v8, v24, v16 ; CHECK-NEXT: vmv1r.v v12, v16 -; CHECK-NEXT: vmv1r.v v13, v17 -; CHECK-NEXT: vcompress.vm v16, v24, v13 -; CHECK-NEXT: vcompress.vm v24, v0, v12 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vcompress.vm v24, v0, v13 +; CHECK-NEXT: vcompress.vm v16, v0, v12 +; CHECK-NEXT: vmv4r.v v12, v16 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v16, a1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; 
CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v12, v24 +; CHECK-NEXT: vs1r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vcompress.vm v16, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v20, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v24, v0, v20 +; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 81b6de9e662d5..ea1a6fe03501b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -10,15 +10,14 @@ define {, } @vector_deinterleave_nxv16i1_nxv ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v12, v8, 1, v0 ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vmerge.vim v14, v8, 1, v0 ; CHECK-NEXT: vnsrl.wi v8, v12, 0 ; CHECK-NEXT: vnsrl.wi v10, v12, 8 ; CHECK-NEXT: vmsne.vi v0, v8, 0 @@ -179,41 +178,63 @@ define {, } @vector_deinterleave_nxv8i64_nxv ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v7, a0 ; CHECK-NEXT: li a0, 170 -; CHECK-NEXT: vmv.v.x v6, a0 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vcompress.vm v24, v8, v7 ; CHECK-NEXT: vmv1r.v v28, v7 -; CHECK-NEXT: vmv1r.v v29, v6 -; CHECK-NEXT: vcompress.vm v0, v8, v29 -; CHECK-NEXT: vcompress.vm v8, v16, v28 -; CHECK-NEXT: addi 
a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vcompress.vm v8, v16, v29 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v0, v16, v28 +; CHECK-NEXT: vmv4r.v v28, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v7, a0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v28, v8 +; CHECK-NEXT: vs1r.v v7, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vcompress.vm v0, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v8, v16, v24 ; CHECK-NEXT: vmv4r.v v4, v8 -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -401,41 +422,63 @@ define {, } @vector_deinterleave_nxv8f ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: li a0, 85 ; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v7, a0 ; CHECK-NEXT: li a0, 170 -; CHECK-NEXT: vmv.v.x v6, a0 -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vcompress.vm v24, v8, v7 ; CHECK-NEXT: vmv1r.v v28, v7 -; CHECK-NEXT: vmv1r.v v29, v6 -; CHECK-NEXT: vcompress.vm v0, v8, v29 -; CHECK-NEXT: vcompress.vm v8, v16, v28 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vcompress.vm v8, v16, v29 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: 
vcompress.vm v0, v16, v28 +; CHECK-NEXT: vmv4r.v v28, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v7, a0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v28, v8 +; CHECK-NEXT: vs1r.v v7, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vcompress.vm v0, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vcompress.vm v8, v16, v24 ; CHECK-NEXT: vmv4r.v v4, v8 -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -578,41 +621,41 @@ define {, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , %data, <16 x i8> %mask, i8 %passthru) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: beqz a1, .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vredmaxu.vs v9, v9, v9 @@ -31,8 +31,8 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) { ; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: beqz a1, .LBB1_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vredmaxu.vs v9, v9, v9 @@ -55,8 +55,8 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) { ; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: beqz a1, .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vredmaxu.vs v9, v9, v9 @@ -79,8 +79,8 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; RV32-NEXT: vmsne.vi v0, v9, 0 ; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vcpop.m a2, v0 ; RV32-NEXT: vid.v v9, v0.t +; RV32-NEXT: vcpop.m a2, v0 ; RV32-NEXT: beqz a2, .LBB3_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: vredmaxu.vs v9, v9, v9 @@ -102,8 +102,8 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) { ; RV64-NEXT: vmsne.vi v0, v9, 0 ; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; RV64-NEXT: vmv.v.i v9, 0 -; RV64-NEXT: vcpop.m a1, v0 
; RV64-NEXT: vid.v v9, v0.t +; RV64-NEXT: vcpop.m a1, v0 ; RV64-NEXT: beqz a1, .LBB3_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: vredmaxu.vs v9, v9, v9 @@ -126,8 +126,8 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass ; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vcpop.m a0, v0 ; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: vcpop.m a0, v0 ; CHECK-NEXT: beqz a0, .LBB4_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vredmaxu.vs v9, v9, v9 @@ -150,8 +150,8 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double % ; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vcpop.m a0, v0 ; CHECK-NEXT: vid.v v9, v0.t +; CHECK-NEXT: vcpop.m a0, v0 ; CHECK-NEXT: beqz a0, .LBB5_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vredmaxu.vs v9, v9, v9 @@ -172,8 +172,8 @@ define i8 @extract_last_i8_scalable( %data, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu ; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: vid.v v10, v0.t +; CHECK-NEXT: vcpop.m a1, v0 ; CHECK-NEXT: beqz a1, .LBB6_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: vredmaxu.vs v10, v10, v10 @@ -193,8 +193,8 @@ define i16 @extract_last_i16_scalable( %data, %data, %data, %data, %data, %data, @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma ; CHECK-NEXT: vsseg5e16.v v8, (a0) ; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a1, a4, a1 ; CHECK-NEXT: vle16.v v9, (a2) ; CHECK-NEXT: vle16.v v10, (a4) ; CHECK-NEXT: vle16.v v11, (a3) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: add a1, a4, a1 ; CHECK-NEXT: vle16.v v12, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v11, v10, 2 @@ -241,11 +241,11 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma ; ZVBB-NEXT: vsseg5e16.v v8, (a0) ; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a1, a4, a1 ; ZVBB-NEXT: vle16.v v9, (a2) ; ZVBB-NEXT: vle16.v v10, (a4) ; ZVBB-NEXT: vle16.v v11, (a3) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: add a1, a4, a1 ; ZVBB-NEXT: vle16.v v12, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVBB-NEXT: vslideup.vi v11, v10, 2 @@ -283,24 +283,24 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; CHECK-NEXT: vsseg7e8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a4) ; CHECK-NEXT: add a4, a4, a1 -; CHECK-NEXT: vle8.v v10, (a2) -; CHECK-NEXT: add a2, a4, a1 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle8.v v10, (a3) +; CHECK-NEXT: add a3, a4, a1 ; CHECK-NEXT: vle8.v v11, (a2) -; CHECK-NEXT: vle8.v v12, (a4) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vle8.v v12, (a3) +; CHECK-NEXT: vle8.v v13, (a4) ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vle8.v v13, (a1) -; CHECK-NEXT: vle8.v v14, (a3) +; CHECK-NEXT: vle8.v v14, (a1) ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vslideup.vi v13, v12, 2 +; CHECK-NEXT: vslideup.vi v8, v11, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v13, 4 -; CHECK-NEXT: vslideup.vi v8, v14, 4 +; CHECK-NEXT: vslideup.vi v13, v14, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v9, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: 
vslideup.vi v8, v12, 8 +; CHECK-NEXT: vslideup.vi v8, v13, 8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 @@ -325,24 +325,24 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZVBB-NEXT: vsseg7e8.v v8, (a0) ; ZVBB-NEXT: vle8.v v9, (a4) ; ZVBB-NEXT: add a4, a4, a1 -; ZVBB-NEXT: vle8.v v10, (a2) -; ZVBB-NEXT: add a2, a4, a1 -; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle8.v v10, (a3) +; ZVBB-NEXT: add a3, a4, a1 ; ZVBB-NEXT: vle8.v v11, (a2) -; ZVBB-NEXT: vle8.v v12, (a4) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vle8.v v12, (a3) +; ZVBB-NEXT: vle8.v v13, (a4) ; ZVBB-NEXT: vle8.v v8, (a0) -; ZVBB-NEXT: vle8.v v13, (a1) -; ZVBB-NEXT: vle8.v v14, (a3) +; ZVBB-NEXT: vle8.v v14, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 2 -; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vslideup.vi v13, v12, 2 +; ZVBB-NEXT: vslideup.vi v8, v11, 2 ; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v13, 4 -; ZVBB-NEXT: vslideup.vi v8, v14, 4 +; ZVBB-NEXT: vslideup.vi v13, v14, 4 +; ZVBB-NEXT: vslideup.vi v8, v10, 4 ; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v9, 6 ; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v12, 8 +; ZVBB-NEXT: vslideup.vi v8, v13, 8 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: .cfi_def_cfa sp, 16 @@ -579,11 +579,11 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma ; CHECK-NEXT: vsseg5e16.v v8, (a0) ; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a1, a4, a1 ; CHECK-NEXT: vle16.v v9, (a2) ; CHECK-NEXT: vle16.v v10, (a4) ; CHECK-NEXT: vle16.v v11, (a3) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: add a1, a4, a1 ; CHECK-NEXT: vle16.v v12, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v11, v10, 2 @@ -616,11 +616,11 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma ; ZVBB-NEXT: vsseg5e16.v v8, (a0) ; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a1, a4, a1 ; ZVBB-NEXT: vle16.v v9, (a2) ; ZVBB-NEXT: vle16.v v10, (a4) ; ZVBB-NEXT: vle16.v v11, (a3) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: add a1, a4, a1 ; ZVBB-NEXT: vle16.v v12, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVBB-NEXT: vslideup.vi v11, v10, 2 @@ -659,24 +659,24 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; CHECK-NEXT: vsseg7e16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a4) ; CHECK-NEXT: add a4, a4, a1 -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: add a2, a4, a1 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle16.v v10, (a3) +; CHECK-NEXT: add a3, a4, a1 ; CHECK-NEXT: vle16.v v11, (a2) -; CHECK-NEXT: vle16.v v12, (a4) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vle16.v v12, (a3) +; CHECK-NEXT: vle16.v v13, (a4) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v13, (a1) -; CHECK-NEXT: vle16.v v14, (a3) +; CHECK-NEXT: vle16.v v14, (a1) ; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v11, 1 -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v13, v12, 1 +; CHECK-NEXT: vslideup.vi v8, v11, 1 ; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v13, 2 -; CHECK-NEXT: vslideup.vi v8, v14, 2 +; CHECK-NEXT: vslideup.vi v13, v14, 2 +; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: 
vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v9, 3 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vslideup.vi v8, v13, 4 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 @@ -703,24 +703,24 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZVBB-NEXT: vsseg7e16.v v8, (a0) ; ZVBB-NEXT: vle16.v v9, (a4) ; ZVBB-NEXT: add a4, a4, a1 -; ZVBB-NEXT: vle16.v v10, (a2) -; ZVBB-NEXT: add a2, a4, a1 -; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle16.v v10, (a3) +; ZVBB-NEXT: add a3, a4, a1 ; ZVBB-NEXT: vle16.v v11, (a2) -; ZVBB-NEXT: vle16.v v12, (a4) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vle16.v v12, (a3) +; ZVBB-NEXT: vle16.v v13, (a4) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: vle16.v v13, (a1) -; ZVBB-NEXT: vle16.v v14, (a3) +; ZVBB-NEXT: vle16.v v14, (a1) ; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 1 -; ZVBB-NEXT: vslideup.vi v8, v10, 1 +; ZVBB-NEXT: vslideup.vi v13, v12, 1 +; ZVBB-NEXT: vslideup.vi v8, v11, 1 ; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v13, 2 -; ZVBB-NEXT: vslideup.vi v8, v14, 2 +; ZVBB-NEXT: vslideup.vi v13, v14, 2 +; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v9, 3 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vslideup.vi v8, v13, 4 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 6aa62c2256925..53ec22f361254 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -124,9 +124,9 @@ define @vector_interleave_nxv4i64_nxv2i64( ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vand.vi v13, v12, 1 -; CHECK-NEXT: vmsne.vi v0, v13, 0 ; CHECK-NEXT: vsrl.vi v16, v12, 1 +; CHECK-NEXT: vand.vi v12, v12, 1 +; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vadd.vx v16, v16, a0, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 @@ -139,9 +139,9 @@ define @vector_interleave_nxv4i64_nxv2i64( ; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu ; ZVBB-NEXT: vid.v v12 ; ZVBB-NEXT: srli a0, a0, 2 -; ZVBB-NEXT: vand.vi v13, v12, 1 -; ZVBB-NEXT: vmsne.vi v0, v13, 0 ; ZVBB-NEXT: vsrl.vi v16, v12, 1 +; ZVBB-NEXT: vand.vi v12, v12, 1 +; ZVBB-NEXT: vmsne.vi v0, v12, 0 ; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t ; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16 @@ -287,13 +287,13 @@ define @vector_interleave_nxv16i64_nxv8i64( @vector_interleave_nxv16i64_nxv8i64( @llvm.vector.interleave2.nxv16i64( %a, %b) @@ -527,9 +527,9 @@ define @vector_interleave_nxv4f64_nxv2f64( @vector_interleave_nxv4f64_nxv2f64( @vector_interleave_nxv16f64_nxv8f64( @vector_interleave_nxv16f64_nxv8f64( @llvm.vector.interleave2.nxv16f64( %a, %b) @@ -745,12 +745,12 @@ define @vector_interleave_nxv48i1_nxv16i1( ; CHECK-NEXT: srli a2, a1, 2 ; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: vl2r.v v10, (a3) +; CHECK-NEXT: add a3, a2, a2 ; CHECK-NEXT: vl2r.v v12, (a0) -; CHECK-NEXT: add a0, a2, a2 ; CHECK-NEXT: vmsne.vi v14, v8, 0 ; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; 
CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v0, v8, a2 ; CHECK-NEXT: add a0, a1, a1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma @@ -788,12 +788,12 @@ define @vector_interleave_nxv48i1_nxv16i1( ; ZVBB-NEXT: srli a2, a1, 2 ; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: vl2r.v v10, (a3) +; ZVBB-NEXT: add a3, a2, a2 ; ZVBB-NEXT: vl2r.v v12, (a0) -; ZVBB-NEXT: add a0, a2, a2 ; ZVBB-NEXT: vmsne.vi v14, v8, 0 ; ZVBB-NEXT: vmsne.vi v8, v10, 0 ; ZVBB-NEXT: vmsne.vi v0, v12, 0 -; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma ; ZVBB-NEXT: vslideup.vx v0, v8, a2 ; ZVBB-NEXT: add a0, a1, a1 ; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma @@ -1045,12 +1045,12 @@ define @vector_interleave_nxv80i1_nxv16i1( ; CHECK-NEXT: vmv2r.v v20, v14 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: add a5, a2, a1 ; CHECK-NEXT: vmv1r.v v21, v18 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 ; CHECK-NEXT: vmv1r.v v22, v16 ; CHECK-NEXT: vmv1r.v v16, v19 -; CHECK-NEXT: add a5, a2, a1 ; CHECK-NEXT: vmv1r.v v23, v8 ; CHECK-NEXT: vmv1r.v v18, v9 ; CHECK-NEXT: vmv1r.v v0, v11 @@ -1121,12 +1121,12 @@ define @vector_interleave_nxv80i1_nxv16i1( ; ZVBB-NEXT: vmv2r.v v20, v14 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 +; ZVBB-NEXT: add a5, a2, a1 ; ZVBB-NEXT: vmv1r.v v21, v18 ; ZVBB-NEXT: vmv1r.v v0, v10 ; ZVBB-NEXT: vmerge.vim v8, v12, 1, v0 ; ZVBB-NEXT: vmv1r.v v22, v16 ; ZVBB-NEXT: vmv1r.v v16, v19 -; ZVBB-NEXT: add a5, a2, a1 ; ZVBB-NEXT: vmv1r.v v23, v8 ; ZVBB-NEXT: vmv1r.v v18, v9 ; ZVBB-NEXT: vmv1r.v v0, v11 @@ -1192,26 +1192,26 @@ define @vector_interleave_nxv80i8_nxv16i8( ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-NEXT: vmv2r.v v20, v16 -; RV32-NEXT: addi a0, sp, 64 ; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: addi a0, sp, 64 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a2, a1, 2 ; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: add a4, a1, a2 ; RV32-NEXT: vmv2r.v v22, v16 ; RV32-NEXT: vmv2r.v v24, v18 ; RV32-NEXT: vmv1r.v v26, v20 -; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v23, v10 -; RV32-NEXT: add a4, a1, a2 ; RV32-NEXT: add a5, a4, a2 -; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: vmv1r.v v23, v10 ; RV32-NEXT: add a6, a5, a2 -; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vmv1r.v v25, v14 ; RV32-NEXT: vsseg5e8.v v22, (a0) +; RV32-NEXT: vmv1r.v v18, v11 ; RV32-NEXT: vmv1r.v v20, v15 ; RV32-NEXT: vsseg5e8.v v17, (a1) ; RV32-NEXT: vl1r.v v16, (a6) @@ -1230,10 +1230,10 @@ define @vector_interleave_nxv80i8_nxv16i8( ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 64 ; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: vl1r.v v15, (a5) ; RV32-NEXT: vl1r.v v12, (a6) ; RV32-NEXT: vl1r.v v13, (a1) -; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: vs2r.v v16, (a2) ; RV32-NEXT: vs8r.v v8, (a0) @@ -1258,26 +1258,26 @@ define @vector_interleave_nxv80i8_nxv16i8( ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-NEXT: vmv2r.v v20, v16 -; RV64-NEXT: addi a0, sp, 64 ; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: addi a0, sp, 64 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 2 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 64 ; RV64-NEXT: 
csrr a2, vlenb -; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: add a4, a1, a2 ; RV64-NEXT: vmv2r.v v22, v16 ; RV64-NEXT: vmv2r.v v24, v18 ; RV64-NEXT: vmv1r.v v26, v20 -; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v23, v10 -; RV64-NEXT: add a4, a1, a2 ; RV64-NEXT: add a5, a4, a2 -; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: vmv1r.v v23, v10 ; RV64-NEXT: add a6, a5, a2 -; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vmv1r.v v25, v14 ; RV64-NEXT: vsseg5e8.v v22, (a0) +; RV64-NEXT: vmv1r.v v18, v11 ; RV64-NEXT: vmv1r.v v20, v15 ; RV64-NEXT: vsseg5e8.v v17, (a1) ; RV64-NEXT: vl1r.v v16, (a6) @@ -1296,10 +1296,10 @@ define @vector_interleave_nxv80i8_nxv16i8( ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 64 ; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: vl1r.v v15, (a5) ; RV64-NEXT: vl1r.v v12, (a6) ; RV64-NEXT: vl1r.v v13, (a1) -; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, a0, a2 ; RV64-NEXT: vs2r.v v16, (a2) ; RV64-NEXT: vs8r.v v8, (a0) @@ -1324,26 +1324,26 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZVBB-RV32-NEXT: andi sp, sp, -64 ; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; ZVBB-RV32-NEXT: vmv2r.v v20, v16 -; ZVBB-RV32-NEXT: addi a0, sp, 64 ; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: addi a0, sp, 64 ; ZVBB-RV32-NEXT: csrr a1, vlenb ; ZVBB-RV32-NEXT: slli a2, a1, 2 ; ZVBB-RV32-NEXT: add a1, a2, a1 ; ZVBB-RV32-NEXT: add a1, sp, a1 ; ZVBB-RV32-NEXT: addi a1, a1, 64 ; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: add a4, a1, a2 ; ZVBB-RV32-NEXT: vmv2r.v v22, v16 ; ZVBB-RV32-NEXT: vmv2r.v v24, v18 ; ZVBB-RV32-NEXT: vmv1r.v v26, v20 -; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v23, v10 -; ZVBB-RV32-NEXT: add a4, a1, a2 ; ZVBB-RV32-NEXT: add a5, a4, a2 -; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 ; ZVBB-RV32-NEXT: add a6, a5, a2 -; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 ; ZVBB-RV32-NEXT: vsseg5e8.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 ; ZVBB-RV32-NEXT: vmv1r.v v20, v15 ; ZVBB-RV32-NEXT: vsseg5e8.v v17, (a1) ; ZVBB-RV32-NEXT: vl1r.v v16, (a6) @@ -1362,10 +1362,10 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZVBB-RV32-NEXT: add a0, sp, a0 ; ZVBB-RV32-NEXT: addi a0, a0, 64 ; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 ; ZVBB-RV32-NEXT: vl1r.v v15, (a5) ; ZVBB-RV32-NEXT: vl1r.v v12, (a6) ; ZVBB-RV32-NEXT: vl1r.v v13, (a1) -; ZVBB-RV32-NEXT: slli a2, a2, 3 ; ZVBB-RV32-NEXT: add a2, a0, a2 ; ZVBB-RV32-NEXT: vs2r.v v16, (a2) ; ZVBB-RV32-NEXT: vs8r.v v8, (a0) @@ -1390,26 +1390,26 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZVBB-RV64-NEXT: andi sp, sp, -64 ; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; ZVBB-RV64-NEXT: vmv2r.v v20, v16 -; ZVBB-RV64-NEXT: addi a0, sp, 64 ; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: addi a0, sp, 64 ; ZVBB-RV64-NEXT: csrr a1, vlenb ; ZVBB-RV64-NEXT: slli a2, a1, 2 ; ZVBB-RV64-NEXT: add a1, a2, a1 ; ZVBB-RV64-NEXT: add a1, sp, a1 ; ZVBB-RV64-NEXT: addi a1, a1, 64 ; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: add a4, a1, a2 ; ZVBB-RV64-NEXT: vmv2r.v v22, v16 ; ZVBB-RV64-NEXT: vmv2r.v v24, v18 ; ZVBB-RV64-NEXT: vmv1r.v v26, v20 -; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v23, v10 -; ZVBB-RV64-NEXT: add a4, a1, a2 ; ZVBB-RV64-NEXT: add a5, a4, a2 
-; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 ; ZVBB-RV64-NEXT: add a6, a5, a2 -; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 ; ZVBB-RV64-NEXT: vsseg5e8.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 ; ZVBB-RV64-NEXT: vmv1r.v v20, v15 ; ZVBB-RV64-NEXT: vsseg5e8.v v17, (a1) ; ZVBB-RV64-NEXT: vl1r.v v16, (a6) @@ -1428,10 +1428,10 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZVBB-RV64-NEXT: add a0, sp, a0 ; ZVBB-RV64-NEXT: addi a0, a0, 64 ; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: slli a2, a2, 3 ; ZVBB-RV64-NEXT: vl1r.v v15, (a5) ; ZVBB-RV64-NEXT: vl1r.v v12, (a6) ; ZVBB-RV64-NEXT: vl1r.v v13, (a1) -; ZVBB-RV64-NEXT: slli a2, a2, 3 ; ZVBB-RV64-NEXT: add a2, a0, a2 ; ZVBB-RV64-NEXT: vs2r.v v16, (a2) ; ZVBB-RV64-NEXT: vs8r.v v8, (a0) @@ -1521,26 +1521,26 @@ define @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @test2( %a, ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI1_0)(a1) ; CHECK-NEXT: lui a1, %hi(.LCPI1_1) -; CHECK-NEXT: fld fa4, %lo(.LCPI1_1)(a1) -; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa5 +; CHECK-NEXT: fld fa5, %lo(.LCPI1_1)(a1) ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfadd.vf v9, v9, fa4, v0.t +; CHECK-NEXT: vfadd.vf v9, v9, fa5, v0.t ; CHECK-NEXT: vfmul.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %t = call @llvm.vp.fmul.nxv1f64( %a, splat (double 2.0), %m, i32 %evl) @@ -48,11 +48,11 @@ define @test3( %a, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: 
vfma_vv_nxv16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v8, (a5) -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 3 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v0, (a5) ; CHECK-NEXT: sub a5, a4, a1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) ; CHECK-NEXT: sltu a3, a4, a5 -; CHECK-NEXT: vl8re64.v v8, (a2) -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a5 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v16, v8 +; CHECK-NEXT: vfmadd.vv v24, v16, v0 +; CHECK-NEXT: vl8re64.v v0, (a2) +; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: bltu a4, a1, .LBB129_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB129_2: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v16, v8 -; CHECK-NEXT: vmv.v.v v8, v0 +; CHECK-NEXT: vfmadd.vv v16, v8, v0 +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.fma.nxv16f64( %va, %b, %c, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll index 901f3cd63fa9e..432994de33321 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll @@ -1108,20 +1108,15 @@ define @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64_unmasked( %va, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: vfma_vv_nxv16f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, 
vlenb -; CHECK-NEXT: slli a1, a1, 5 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a3, 24 -; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a5, a2, a3 -; CHECK-NEXT: vl8re64.v v8, (a5) -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: slli a5, a5, 3 -; CHECK-NEXT: add a5, sp, a5 -; CHECK-NEXT: addi a5, a5, 16 -; CHECK-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re64.v v0, (a5) ; CHECK-NEXT: sub a5, a4, a1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) ; CHECK-NEXT: sltu a3, a4, a5 -; CHECK-NEXT: vl8re64.v v8, (a2) -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vl8re64.v v0, (a0) ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a5 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v16, v8 +; CHECK-NEXT: vfmadd.vv v24, v16, v0 +; CHECK-NEXT: vl8re64.v v0, (a2) +; CHECK-NEXT: vl8re64.v v16, (a0) ; CHECK-NEXT: bltu a4, a1, .LBB93_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB93_2: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v16, v8 -; CHECK-NEXT: vmv.v.v v8, v0 +; CHECK-NEXT: vfmadd.vv v16, v8, v0 +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: vmv8r.v v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.fmuladd.nxv16f64( %va, %b, %c, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll index 63156e1399293..6f4d2dd626bfb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -149,69 +149,68 @@ define @vfptrunc_nxv32f32_nxv32f64( ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, sp, 16 ; 
CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: srli a5, a1, 2 -; CHECK-NEXT: slli a6, a1, 3 -; CHECK-NEXT: slli a4, a1, 1 -; CHECK-NEXT: vslidedown.vx v16, v0, a5 -; CHECK-NEXT: add a6, a0, a6 -; CHECK-NEXT: sub a5, a2, a4 -; CHECK-NEXT: vl8re64.v v24, (a6) -; CHECK-NEXT: sltu a6, a2, a5 +; CHECK-NEXT: srli a4, a1, 2 +; CHECK-NEXT: slli a5, a1, 3 +; CHECK-NEXT: slli a3, a1, 1 +; CHECK-NEXT: vslidedown.vx v16, v0, a4 +; CHECK-NEXT: add a5, a0, a5 +; CHECK-NEXT: sub a4, a2, a3 +; CHECK-NEXT: vl8re64.v v24, (a5) +; CHECK-NEXT: sltu a5, a2, a4 +; CHECK-NEXT: addi a5, a5, -1 +; CHECK-NEXT: and a4, a5, a4 +; CHECK-NEXT: sub a5, a4, a1 +; CHECK-NEXT: sltu a6, a4, a5 ; CHECK-NEXT: addi a6, a6, -1 -; CHECK-NEXT: and a5, a6, a5 -; CHECK-NEXT: sub a6, a5, a1 -; CHECK-NEXT: sltu a7, a5, a6 -; CHECK-NEXT: addi a7, a7, -1 -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v16, a3 -; CHECK-NEXT: and a0, a7, a6 -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: and a6, a6, a5 +; CHECK-NEXT: srli a5, a1, 3 +; CHECK-NEXT: vsetvli a7, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v16, a5 +; CHECK-NEXT: vsetvli zero, a6, e32, m4, ta, ma ; CHECK-NEXT: vfncvt.f.f.w v20, v24, v0.t -; CHECK-NEXT: bltu a5, a1, .LBB8_2 +; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: bltu a4, a1, .LBB8_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a5, a1 +; CHECK-NEXT: mv a4, a1 ; CHECK-NEXT: .LBB8_2: ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v6, v7, a3 -; CHECK-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v16, v8, v0.t -; CHECK-NEXT: bltu a2, a4, .LBB8_4 +; CHECK-NEXT: vslidedown.vx v6, v7, a5 +; CHECK-NEXT: vsetvli zero, a4, e32, m4, ta, ma +; CHECK-NEXT: vfncvt.f.f.w v16, v24, v0.t +; CHECK-NEXT: bltu a2, a3, .LBB8_4 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: mv a2, a3 ; CHECK-NEXT: .LBB8_4: ; CHECK-NEXT: sub a0, a2, a1 ; CHECK-NEXT: sltu a3, a2, a0 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a0, a3, a0 ; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v28, v8, v0.t +; CHECK-NEXT: vfncvt.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a2, a1, .LBB8_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB8_6: ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-NEXT: vfncvt.f.f.w v24, v8, v0.t -; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: vfncvt.f.f.w v8, v24, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll index 4336b27eb134a..3ace3ccdf0ee4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll @@ -175,8 +175,8 @@ define @vfsqrt_vv_nxv32bf16( 
%va, < ; CHECK-NEXT: sub a3, a0, a1 ; CHECK-NEXT: sltu a4, a0, a3 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: vslidedown.vx v0, v0, a2 ; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vslidedown.vx v0, v0, a2 ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma @@ -210,9 +210,9 @@ define @vfsqrt_vv_nxv32bf16_unmasked( @vfsqrt_vv_nxv32f16( %va, @vfsqrt_vv_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: sub a3, a0, a1 ; ZVFHMIN-NEXT: sltu a4, a0, a3 ; ZVFHMIN-NEXT: addi a4, a4, -1 -; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 ; ZVFHMIN-NEXT: and a3, a4, a3 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v0, v16, a2 ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index c6ee9e34dc207..8003d8fed58bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -3109,9 +3109,9 @@ define @vmand_mm( %a, %b, ; NOVLOPT-NEXT: vmand.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmand_mm: @@ -3119,9 +3119,9 @@ define @vmand_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmand.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmand.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3136,9 +3136,9 @@ define @vmnand_mm( %a, %b, ; NOVLOPT-NEXT: vmnand.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmnand_mm: @@ -3146,9 +3146,9 @@ define @vmnand_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmnand.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmnand.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3163,9 +3163,9 @@ define @vmandn_mm( %a, %b, ; NOVLOPT-NEXT: vmandn.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmandn_mm: @@ -3173,9 +3173,9 @@ define @vmandn_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmandn.mm v8, v0, v8 ; VLOPT-NEXT: 
vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmandn.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3190,9 +3190,9 @@ define @vmxor_mm( %a, %b, ; NOVLOPT-NEXT: vmxor.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmxor_mm: @@ -3200,9 +3200,9 @@ define @vmxor_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmxor.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmxor.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3217,9 +3217,9 @@ define @vmor_mm( %a, %b, < ; NOVLOPT-NEXT: vmor.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmor_mm: @@ -3227,9 +3227,9 @@ define @vmor_mm( %a, %b, < ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmor.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmor.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3245,9 +3245,9 @@ define @vmnor_mm( %a, %b, ; NOVLOPT-NEXT: vmnor.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmnor_mm: @@ -3255,9 +3255,9 @@ define @vmnor_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmnor.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmnor.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3272,9 +3272,9 @@ define @vmorn_mm( %a, %b, ; NOVLOPT-NEXT: vmorn.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmorn_mm: @@ -3282,9 +3282,9 @@ define @vmorn_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; 
VLOPT-NEXT: vmorn.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmorn.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) @@ -3299,9 +3299,9 @@ define @vmxnor_mm( %a, %b, ; NOVLOPT-NEXT: vmxnor.mm v8, v0, v8 ; NOVLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; NOVLOPT-NEXT: vmand.mm v0, v0, v8 -; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; NOVLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; NOVLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; NOVLOPT-NEXT: vmv1r.v v8, v9 ; NOVLOPT-NEXT: ret ; ; VLOPT-LABEL: vmxnor_mm: @@ -3309,9 +3309,9 @@ define @vmxnor_mm( %a, %b, ; VLOPT-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; VLOPT-NEXT: vmxnor.mm v8, v0, v8 ; VLOPT-NEXT: vmand.mm v0, v0, v8 -; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: vsetvli zero, zero, e32, mf2, tu, mu -; VLOPT-NEXT: vadd.vv v8, v9, v9, v0.t +; VLOPT-NEXT: vadd.vv v9, v9, v9, v0.t +; VLOPT-NEXT: vmv1r.v v8, v9 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vmxnor.nxv1i1( %a, %b, iXLen -1) %2 = call @llvm.riscv.vmand.nxv1i1( %a, %1, iXLen %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll index 737ef6bae4e42..e4235d03cda31 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll @@ -49,8 +49,8 @@ define i64 @test_vleff_nxv8i8_mask( %maskedoff, ptr %p, @vmax_vx_nxv32i32_evl_nx16( %va, i ; RV64-LABEL: vmax_vx_nxv32i32_evl_nx16: ; RV64: # %bb.0: ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV64-NEXT: vmax.vx v8, v8, a0, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v24, v0, a1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vmax.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, ma ; RV64-NEXT: vmax.vx v16, v16, a0, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll index 8147d467be04e..df7f177681f5e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll @@ -1076,12 +1076,11 @@ define @vmaxu_vx_nxv32i32_evl_nx16( %va, ; RV64-LABEL: vmaxu_vx_nxv32i32_evl_nx16: ; RV64: # %bb.0: ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV64-NEXT: vmaxu.vx v8, v8, a0, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v24, v0, a1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vmaxu.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, ma ; RV64-NEXT: vmaxu.vx v16, v16, a0, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll index 614bd4cbde9ec..342c037371b57 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll @@ -1077,12 +1077,11 @@ define @vmin_vx_nxv32i32_evl_nx16( %va, i ; RV64-LABEL: vmin_vx_nxv32i32_evl_nx16: ; RV64: # %bb.0: ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV64-NEXT: vmin.vx v8, v8, a0, v0.t ; RV64-NEXT: 
srli a1, a1, 2 ; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v24, v0, a1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vmin.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, ma ; RV64-NEXT: vmin.vx v16, v16, a0, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll index 21160553af59d..6821aa6c7e380 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll @@ -1076,12 +1076,11 @@ define @vminu_vx_nxv32i32_evl_nx16( %va, ; RV64-LABEL: vminu_vx_nxv32i32_evl_nx16: ; RV64: # %bb.0: ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; RV64-NEXT: vminu.vx v8, v8, a0, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v24, v0, a1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vminu.vx v8, v8, a0, v0.t -; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, ma ; RV64-NEXT: vminu.vx v16, v16, a0, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll index 6407f39a65e8b..275f96d1d526c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmseq.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmseq.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmseq_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmseq_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmseq.vv v10, v8, v11, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmseq_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmseq_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmseq.vv v11, v8, v12, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmseq_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmseq_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmseq.vv v13, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll index 45e3840f7e673..2c1a525220eea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsge.ll @@ -1725,12 +1725,12 @@ define @intrinsic_vmsge_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsge_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; 
RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsle.vv v10, v11, v8, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1800,12 +1800,12 @@ define @intrinsic_vmsge_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsge_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsle.vv v11, v12, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1875,12 +1875,12 @@ define @intrinsic_vmsge_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsge_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsle.vv v13, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v13 @@ -2872,12 +2872,12 @@ define @intrinsic_vmsge_maskedoff_mask_vx_nxv2i64_i64( @intrinsic_vmsge_maskedoff_mask_vx_nxv4i64_i64( @intrinsic_vmsgeu_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsgeu_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsleu.vv v10, v11, v8, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1761,12 +1761,12 @@ define @intrinsic_vmsgeu_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsgeu_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsleu.vv v11, v12, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1836,12 +1836,12 @@ define @intrinsic_vmsgeu_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsgeu_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsleu.vv v13, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v13 @@ -2851,12 +2851,12 @@ define @intrinsic_vmsgeu_maskedoff_mask_vx_nxv2i64_i64( @intrinsic_vmsgeu_maskedoff_mask_vx_nxv4i64_i64( @intrinsic_vmsgt_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsgt_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; 
RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmslt.vv v10, v11, v8, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmsgt_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsgt_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmslt.vv v11, v12, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmsgt_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsgt_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmslt.vv v13, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll index d57b9cd5bae53..f67d2ed047ae7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmsgtu_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsgtu_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsltu.vv v10, v11, v8, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmsgtu_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsgtu_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsltu.vv v11, v12, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmsgtu_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsgtu_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsltu.vv v13, v16, v8, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll index 9653dfd2518d8..6aed4286c3495 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsle.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsle.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmsle_mask_vx_nxv1i64_i64( ; RV32-LABEL: 
intrinsic_vmsle_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsle.vv v10, v8, v11, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmsle_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsle_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsle.vv v11, v8, v12, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmsle_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsle_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsle.vv v13, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll index 25ecfa65c7c48..d881b12d7c1e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsleu.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmsleu_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsleu_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsleu.vv v10, v8, v11, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmsleu_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsleu_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsleu.vv v11, v8, v12, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmsleu_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsleu_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsleu.vv v13, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll index 
c17495e3b2119..26c3493dd03ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmslt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmslt.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmslt_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmslt_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmslt.vv v10, v8, v11, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmslt_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmslt_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmslt.vv v11, v8, v12, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmslt_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmslt_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmslt.vv v13, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll index a37a02848365d..2d4795b5b8d30 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsltu.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmsltu_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsltu_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsltu.vv v10, v8, v11, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmsltu_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsltu_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsltu.vv v11, v8, v12, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmsltu_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsltu_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; 
RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsltu.vv v13, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll index ed41a18dcc8d3..9d43267f511e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsne.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsne.ll @@ -1670,12 +1670,12 @@ define @intrinsic_vmsne_mask_vx_nxv1i64_i64( ; RV32-LABEL: intrinsic_vmsne_mask_vx_nxv1i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vlse64.v v11, (a0), zero -; RV32-NEXT: vmv1r.v v10, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmsne.vv v10, v8, v11, v0.t ; RV32-NEXT: vmv.v.v v0, v10 @@ -1744,12 +1744,12 @@ define @intrinsic_vmsne_mask_vx_nxv2i64_i64( ; RV32-LABEL: intrinsic_vmsne_mask_vx_nxv2i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmv1r.v v11, v0 ; RV32-NEXT: vmv1r.v v0, v10 ; RV32-NEXT: vmsne.vv v11, v8, v12, v0.t ; RV32-NEXT: vmv1r.v v0, v11 @@ -1818,12 +1818,12 @@ define @intrinsic_vmsne_mask_vx_nxv4i64_i64( ; RV32-LABEL: intrinsic_vmsne_mask_vx_nxv4i64_i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: sw a0, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vmv1r.v v13, v0 ; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: vmsne.vv v13, v8, v16, v0.t ; RV32-NEXT: vmv1r.v v0, v13 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll index 4629db26ca034..647960a404d4b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll @@ -248,8 +248,8 @@ define @intrinsic_vmv.s.x_x_nxv1i64( %0, i6 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: vid.v v9 -; RV32-NEXT: vmseq.vi v0, v9, 0 ; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vmseq.vi v0, v9, 0 ; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -274,8 +274,8 @@ define @intrinsic_vmv.s.x_x_nxv2i64( %0, i6 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu ; RV32-NEXT: vid.v v10 -; RV32-NEXT: vmseq.vi v0, v10, 0 ; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vmseq.vi v0, v10, 0 ; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -300,8 +300,8 @@ define @intrinsic_vmv.s.x_x_nxv4i64( %0, i6 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu ; RV32-NEXT: vid.v v12 -; RV32-NEXT: vmseq.vi v0, v12, 0 ; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vmseq.vi v0, v12, 0 ; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -326,8 +326,8 @@ define @intrinsic_vmv.s.x_x_nxv8i64( %0, i6 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu ; RV32-NEXT: vid.v v16 -; RV32-NEXT: vmseq.vi v0, v16, 0 ; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vmseq.vi v0, v16, 0 ; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t ; RV32-NEXT: addi sp, sp, 16 ; 
RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv0-elimination.ll b/llvm/test/CodeGen/RISCV/rvv/vmv0-elimination.ll new file mode 100644 index 0000000000000..ba885abdce441 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vmv0-elimination.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +; We have an invariant that any vmv0 use won't clobber an existing v0 definition that's used. +; Check that %asm2 has a $v0 = COPY just before it so that %x doesn't clobber it. +define @between_inline_asm( %a, %b, %mask, ptr %p) { +; CHECK-LABEL: between_inline_asm: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: #APP +; CHECK-NEXT: vadd.vv v0, v8, v9 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v11, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vadd.vv v9, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vs1r.v v9, (a0) +; CHECK-NEXT: #APP +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ret + %asm1 = tail call asm "vadd.vv $0, $1, $2", "={v0},^vr,^vr"( %a, %b) + %x = call @llvm.riscv.vadd.mask( poison, %a, %b, %mask, i64 -1, i64 0) + store %x, ptr %p + %asm2 = tail call asm "vadd.vv $0, $1, $2", "=^vr,^vr,{v0}"( %a, %asm1) + ret %asm2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll index a2466c48b0ab7..622f7dfebec9c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-combine-store-reverse.ll @@ -65,9 +65,10 @@ define void @test_different_evl( %val, * ; CHECK-NEXT: vrsub.vx v11, v11, a1 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v10, v9 -; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vrgather.vv v9, v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v9, (a0), v0.t ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll index b316f5f878816..1c3f2ed6f81b9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-cttz-elts.ll @@ -169,18 +169,18 @@ define i1 @nxv2i32_cmp_evl( %src, %m, i32 %e ; ; RV64-LABEL: nxv2i32_cmp_evl: ; RV64: # %bb.0: -; RV64-NEXT: slli a1, a0, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; RV64-NEXT: vmsne.vi v8, v8, 0, v0.t ; RV64-NEXT: vfirst.m a2, v8, v0.t -; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: bltz a2, .LBB6_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a1, a2 +; RV64-NEXT: mv a0, a2 ; RV64-NEXT: .LBB6_2: -; RV64-NEXT: sext.w a1, a1 -; RV64-NEXT: xor a0, a1, a0 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret %r = call i32 @llvm.vp.cttz.elts.i32.nxv2i32( %src, i1 0, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index e481891dfd52f..2214523c58e5b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll 
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -331,8 +331,9 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV32-NEXT: vmv1r.v v8, v0 +; RV32-NEXT: slli a2, a1, 1 ; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: li a2, -1 +; RV32-NEXT: li a1, -1 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: csrr a3, vlenb @@ -340,20 +341,19 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV32-NEXT: vmerge.vim v11, v9, 1, v0 ; RV32-NEXT: srli a3, a3, 2 ; RV32-NEXT: vwaddu.vv v12, v11, v11 -; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vwmaccu.vx v12, a1, v11 +; RV32-NEXT: add a1, a3, a3 ; RV32-NEXT: vmsne.vi v0, v12, 0 -; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; RV32-NEXT: vslidedown.vx v11, v12, a3 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV32-NEXT: vmsne.vi v0, v11, 0 -; RV32-NEXT: add a2, a3, a3 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vx v10, v9, a3 -; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: slli a2, a1, 1 ; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0), v0.t ; RV32-NEXT: li a1, 32 @@ -383,19 +383,19 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: srli a3, a3, 2 ; RV64-NEXT: vwaddu.vv v12, v11, v11 ; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: add a1, a3, a3 ; RV64-NEXT: vmsne.vi v0, v12, 0 -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV64-NEXT: vslidedown.vx v11, v12, a3 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV64-NEXT: vmsne.vi v0, v11, 0 -; RV64-NEXT: add a1, a3, a3 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 ; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vx v10, v9, a3 -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmsne.vi v0, v10, 0 ; RV64-NEXT: srli a1, a4, 32 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v10, (a0), v0.t ; RV64-NEXT: li a1, 32 @@ -676,6 +676,7 @@ define {, } @not_same_mask( ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV32-NEXT: vmv1r.v v9, v0 ; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: li a2, -1 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma @@ -688,19 +689,18 @@ define {, } @not_same_mask( ; RV32-NEXT: srli a3, a3, 2 ; RV32-NEXT: vwaddu.vv v12, v9, v11 ; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: add a2, a3, a3 ; RV32-NEXT: vmsne.vi v0, v12, 0 -; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; RV32-NEXT: vslidedown.vx v9, v12, a3 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV32-NEXT: vmsne.vi v0, v9, 0 -; RV32-NEXT: add a2, a3, a3 ; RV32-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vx v10, v8, a3 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; 
RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0), v0.t ; RV32-NEXT: li a0, 32 @@ -725,21 +725,21 @@ define {, } @not_same_mask( ; RV64-NEXT: vmv1r.v v0, v9 ; RV64-NEXT: vmerge.vim v9, v8, 1, v0 ; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vwaddu.vv v12, v9, v11 ; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: add a2, a3, a3 ; RV64-NEXT: vmsne.vi v0, v12, 0 -; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; RV64-NEXT: vslidedown.vx v9, v12, a3 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV64-NEXT: vmsne.vi v0, v9, 0 -; RV64-NEXT: add a2, a3, a3 ; RV64-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vx v10, v8, a3 ; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v10, (a0), v0.t ; RV64-NEXT: li a0, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll index 1007d1ce649cc..eacc9b329fba3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll @@ -2435,11 +2435,11 @@ define @vpgather_nxv16f64( %ptrs, @vpgather_nxv16f64( %ptrs, @vpgather_nxv16f64( %ptrs, @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, %idxs, %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 8 +; RV32-NEXT: li a2, 8 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV32-NEXT: vwmulsu.vx v24, v8, a2 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV32-NEXT: vwmulsu.vx v24, v8, a3 ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: bltu a1, a2, .LBB112_2 ; RV32-NEXT: # %bb.1: @@ -2495,9 +2495,9 @@ define @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base, %idxs, %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 8 +; RV32-NEXT: li a2, 8 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV32-NEXT: vwmulsu.vx v24, v8, a2 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV32-NEXT: vwmulsu.vx v24, v8, a3 ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: bltu a1, a2, .LBB113_2 ; RV32-NEXT: # %bb.1: @@ -2552,9 +2551,9 @@ define @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base ; RV32-NEXT: srli a2, a2, 3 ; RV32-NEXT: sltu a1, a1, a3 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v0, v0, a2 ; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vx v0, v0, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret @@ -2564,20 +2563,19 @@ define @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsext.vf4 v24, v8 ; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: sub a3, a1, a2 -; RV64-NEXT: srli a4, a2, 3 -; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma -; RV64-NEXT: 
vslidedown.vx v0, v0, a4 ; RV64-NEXT: sltu a4, a1, a3 ; RV64-NEXT: addi a4, a4, -1 ; RV64-NEXT: and a3, a4, a3 +; RV64-NEXT: srli a4, a2, 3 +; RV64-NEXT: vsetvli a5, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a4 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: vluxei64.v v16, (a0), v16, v0.t -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf4 v24, v8 -; RV64-NEXT: vsll.vi v24, v24, 3 ; RV64-NEXT: bltu a1, a2, .LBB113_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a1, a2 @@ -2595,10 +2593,10 @@ define @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base define @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base, %idxs, %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64: ; RV32: # %bb.0: -; RV32-NEXT: li a3, 8 +; RV32-NEXT: li a2, 8 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV32-NEXT: vwmulu.vx v24, v8, a2 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV32-NEXT: vwmulu.vx v24, v8, a3 ; RV32-NEXT: mv a3, a1 ; RV32-NEXT: bltu a1, a2, .LBB114_2 ; RV32-NEXT: # %bb.1: @@ -2610,19 +2608,19 @@ define @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base ; RV32-NEXT: srli a2, a2, 3 ; RV32-NEXT: sltu a1, a1, a3 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v0, v0, a2 ; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vx v0, v0, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_nxv16i16_nxv16f64: ; RV64: # %bb.0: -; RV64-NEXT: li a3, 8 +; RV64-NEXT: li a2, 8 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV64-NEXT: vwmulu.vx v24, v8, a2 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV64-NEXT: vwmulu.vx v24, v8, a3 ; RV64-NEXT: mv a3, a1 ; RV64-NEXT: bltu a1, a2, .LBB114_2 ; RV64-NEXT: # %bb.1: @@ -2634,9 +2632,9 @@ define @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base ; RV64-NEXT: srli a2, a2, 3 ; RV64-NEXT: sltu a1, a1, a3 ; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a2 ; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a2 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei32.v v16, (a0), v28, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll index 0844180e49612..b73659e7ce415 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll @@ -527,12 +527,12 @@ define @vpload_nxv16f64(ptr %ptr, %m, ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: srli a5, a2, 3 -; CHECK-NEXT: vslidedown.vx v0, v0, a5 ; CHECK-NEXT: sltu a5, a1, a3 ; CHECK-NEXT: addi a5, a5, -1 ; CHECK-NEXT: and a3, a5, a3 +; CHECK-NEXT: srli a5, a2, 3 ; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vslidedown.vx v0, v0, a5 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v16, (a4), v0.t ; CHECK-NEXT: bltu a1, a2, .LBB44_2 @@ -591,9 +591,9 @@ define @vpload_nxv17f64(ptr %ptr, ptr %out, @llvm.vp.merge.nxv128i8(, @vpmerge_vv_nxv128i8( %va, %vb, %m, i32 zeroext %evl) { ; CHECK-LABEL: vpmerge_vv_nxv128i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; 
CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: vmv8r.v v24, v16 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vlm.v v0, (a2) ; CHECK-NEXT: slli a1, a1, 3 @@ -572,26 +564,19 @@ define @vpmerge_vv_nxv128i8( %va, @llvm.vp.merge.nxv128i8( %m, %va, %vb, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll index 2cf6248c17598..9340be684f2cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll @@ -2268,9 +2268,9 @@ define void @vpscatter_nxv16f64( %val, ; RV32-NEXT: srli a0, a0, 3 ; RV32-NEXT: sltu a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v0, v0, a0 ; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vx v0, v0, a0 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (zero), v28, v0.t ; RV32-NEXT: ret @@ -2289,21 +2289,21 @@ define void @vpscatter_nxv16f64( %val, ; RV64-NEXT: slli a3, a1, 3 ; RV64-NEXT: add a3, a0, a3 ; RV64-NEXT: vl8re64.v v16, (a3) +; RV64-NEXT: mv a3, a2 ; RV64-NEXT: vl8re64.v v24, (a0) -; RV64-NEXT: mv a0, a2 ; RV64-NEXT: bltu a2, a1, .LBB108_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a3, a1 ; RV64-NEXT: .LBB108_2: -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t ; RV64-NEXT: sub a0, a2, a1 ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a0 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma @@ -2323,10 +2323,10 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt ; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64: ; RV32: # %bb.0: ; RV32-NEXT: vl4re16.v v4, (a1) -; RV32-NEXT: li a3, 8 +; RV32-NEXT: li a1, 8 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV32-NEXT: vwmulsu.vx v24, v4, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV32-NEXT: vwmulsu.vx v24, v4, a3 ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: bltu a2, a1, .LBB109_2 ; RV32-NEXT: # %bb.1: @@ -2338,9 +2338,9 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: sltu a2, a2, a3 ; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: and a2, a2, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret @@ -2359,14 +2359,14 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vl4re16.v v24, (a1) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf4 v16, v26 ; 
RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsext.vf4 v16, v24 ; RV64-NEXT: vsll.vi v24, v16, 3 +; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: mv a3, a2 ; RV64-NEXT: bltu a2, a1, .LBB109_2 ; RV64-NEXT: # %bb.1: @@ -2378,9 +2378,9 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64( %val, pt ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a2, a2, a3 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 @@ -2406,10 +2406,10 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV32-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64: ; RV32: # %bb.0: ; RV32-NEXT: vl4re16.v v4, (a1) -; RV32-NEXT: li a3, 8 +; RV32-NEXT: li a1, 8 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV32-NEXT: vwmulsu.vx v24, v4, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV32-NEXT: vwmulsu.vx v24, v4, a3 ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: bltu a2, a1, .LBB110_2 ; RV32-NEXT: # %bb.1: @@ -2421,9 +2421,9 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: sltu a2, a2, a3 ; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: and a2, a2, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret @@ -2442,14 +2442,14 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV64-NEXT: vl4re16.v v24, (a1) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf4 v16, v26 ; RV64-NEXT: vsll.vi v16, v16, 3 -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsext.vf4 v16, v24 ; RV64-NEXT: vsll.vi v24, v16, 3 +; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: mv a3, a2 ; RV64-NEXT: bltu a2, a1, .LBB110_2 ; RV64-NEXT: # %bb.1: @@ -2461,9 +2461,9 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64( %va ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a2, a2, a3 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 @@ -2490,10 +2490,10 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64( %va ; RV32-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64: ; RV32: # %bb.0: ; RV32-NEXT: vl4re16.v v4, (a1) -; RV32-NEXT: li a3, 8 +; RV32-NEXT: li a1, 8 +; RV32-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV32-NEXT: vwmulu.vx v24, v4, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV32-NEXT: vwmulu.vx v24, v4, a3 ; RV32-NEXT: mv a3, a2 ; RV32-NEXT: bltu a2, a1, .LBB111_2 ; RV32-NEXT: # %bb.1: @@ -2505,9 
+2505,9 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64( %va ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: sltu a2, a2, a3 ; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: and a2, a2, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vx v0, v0, a1 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV32-NEXT: ret @@ -2515,10 +2515,10 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64( %va ; RV64-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64: ; RV64: # %bb.0: ; RV64-NEXT: vl4re16.v v4, (a1) -; RV64-NEXT: li a3, 8 +; RV64-NEXT: li a1, 8 +; RV64-NEXT: vsetvli a3, zero, e16, m4, ta, ma +; RV64-NEXT: vwmulu.vx v24, v4, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: vsetvli a4, zero, e16, m4, ta, ma -; RV64-NEXT: vwmulu.vx v24, v4, a3 ; RV64-NEXT: mv a3, a2 ; RV64-NEXT: bltu a2, a1, .LBB111_2 ; RV64-NEXT: # %bb.1: @@ -2530,9 +2530,9 @@ define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64( %va ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: sltu a2, a2, a3 ; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: and a2, a2, a3 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vsoxei32.v v16, (a0), v28, v0.t ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll index 7e7da529bf3d7..5cb4176a1be19 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll @@ -439,15 +439,15 @@ define void @vpstore_nxv16f64( %val, ptr %ptr, %val, ptr %ptr, %v) { ; CHECK-LABEL: vreduce_fmin_nxv10f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: lui a1, %hi(.LCPI73_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI73_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v12, (a1) -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: li a1, 10 -; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfredmin.vs v12, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll index 7b460f2c058f8..df0792a68e05a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-subreg-liveness.ll @@ -12,11 +12,11 @@ define internal void @foo( %v15, %0, %vs12.i.i.i, %1, %v37) { ; NOSUBREG-LABEL: foo: ; NOSUBREG: # %bb.0: # %loopIR.preheader.i.i -; NOSUBREG-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; NOSUBREG-NEXT: vmv.v.i v9, 0 -; NOSUBREG-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; NOSUBREG-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; NOSUBREG-NEXT: vmv.v.i v14, 0 -; NOSUBREG-NEXT: vmv1r.v v8, v9 +; NOSUBREG-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; NOSUBREG-NEXT: vmv.v.i v9, 0 +; NOSUBREG-NEXT: vmv.v.i v8, 0 ; NOSUBREG-NEXT: vsetivli zero, 4, e8, m1, tu, ma ; NOSUBREG-NEXT: vrgatherei16.vv v8, v9, v14 ; NOSUBREG-NEXT: .LBB0_1: # %loopIR3.i.i @@ -32,11 +32,11 @@ define internal void @foo( %v15, %0, @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; NO_FOLDING-NEXT: vlm.v v8, (a2) -; NO_FOLDING-NEXT: vlm.v 
v9, (a1) +; NO_FOLDING-NEXT: vlm.v v8, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) ; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 -; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 ; NO_FOLDING-NEXT: vmv.v.v v0, v9 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vmv.v.v v0, v8 ; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t -; NO_FOLDING-NEXT: vor.vv v8, v10, v8 +; NO_FOLDING-NEXT: vor.vv v8, v10, v9 ; NO_FOLDING-NEXT: ret ; ; FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users: ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) -; FOLDING-NEXT: vlm.v v8, (a2) -; FOLDING-NEXT: vlm.v v9, (a1) +; FOLDING-NEXT: vlm.v v8, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) ; FOLDING-NEXT: vmv.v.i v10, 0 ; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; FOLDING-NEXT: vadd.vv v10, v11, v8 -; FOLDING-NEXT: vsub.vv v8, v11, v8 ; FOLDING-NEXT: vmv.v.v v0, v9 +; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vmv.v.v v0, v8 ; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t -; FOLDING-NEXT: vor.vv v8, v10, v8 +; FOLDING-NEXT: vor.vv v8, v10, v9 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y @@ -496,34 +496,34 @@ define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; NO_FOLDING-NEXT: vlm.v v8, (a2) -; NO_FOLDING-NEXT: vlm.v v9, (a1) +; NO_FOLDING-NEXT: vlm.v v8, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) ; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 -; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 ; NO_FOLDING-NEXT: vmv1r.v v0, v9 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 +; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING-NEXT: vmv1r.v v0, v8 ; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t -; NO_FOLDING-NEXT: vor.vv v8, v10, v8 +; NO_FOLDING-NEXT: vor.vv v8, v10, v9 ; NO_FOLDING-NEXT: ret ; ; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users: ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) -; FOLDING-NEXT: vlm.v v8, (a2) -; FOLDING-NEXT: vlm.v v9, (a1) +; FOLDING-NEXT: vlm.v v8, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) ; FOLDING-NEXT: vmv.v.i v10, 0 ; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; FOLDING-NEXT: vadd.vv v10, v11, v8 -; FOLDING-NEXT: vsub.vv v8, v11, v8 ; FOLDING-NEXT: vmv1r.v v0, v9 +; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vmv1r.v v0, v8 ; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t -; FOLDING-NEXT: vor.vv v8, v10, v8 +; FOLDING-NEXT: vor.vv v8, v10, v9 ; FOLDING-NEXT: ret %a = load , ptr %x %b = load , ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll index be2fc6955294d..cc923d8acd245 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll @@ 
-519,10 +519,10 @@ define void @vselect_legalize_regression( %a, @llvm.vp.select.nxv32i32(, @select_nxv32i32( %a, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: select_nxv32i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: slli a4, a3, 3 ; CHECK-NEXT: slli a1, a3, 1 ; CHECK-NEXT: srli a3, a3, 2 ; CHECK-NEXT: add a4, a0, a4 -; CHECK-NEXT: sub a5, a2, a1 -; CHECK-NEXT: vl8re32.v v8, (a4) -; CHECK-NEXT: sltu a4, a2, a5 +; CHECK-NEXT: vslidedown.vx v0, v0, a3 +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: vl8re32.v v24, (a4) +; CHECK-NEXT: sltu a4, a2, a3 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: vl8re32.v v0, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a3 -; CHECK-NEXT: and a4, a4, a5 -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: bltu a2, a1, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %v = call @llvm.vp.select.nxv32i32( %a, %b, %c, i32 %evl) ret %v @@ -410,55 +384,29 @@ declare i32 @llvm.vscale.i32() define @select_evl_nxv32i32( %a, %b, %c) { ; CHECK-LABEL: select_evl_nxv32i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: slli a2, a1, 1 ; CHECK-NEXT: srli a4, a1, 2 ; CHECK-NEXT: add a3, a0, a3 -; CHECK-NEXT: sub a5, a1, a2 -; CHECK-NEXT: vl8re32.v v8, (a3) -; CHECK-NEXT: sltu a3, a1, a5 +; CHECK-NEXT: vslidedown.vx v0, v0, a4 +; CHECK-NEXT: sub a4, a1, a2 +; CHECK-NEXT: vl8re32.v v24, (a3) +; CHECK-NEXT: sltu a3, a1, a4 ; 
CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: vl8re32.v v0, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a4 -; CHECK-NEXT: and a3, a3, a5 +; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vl8re32.v v24, (a0) ; CHECK-NEXT: bltu a1, a2, .LBB28_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a1, a2 ; CHECK-NEXT: .LBB28_2: -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %evl = call i32 @llvm.vscale.i32() %evl0 = mul i32 %evl, 8 @@ -699,54 +647,28 @@ declare @llvm.vp.select.nxv16f64(, @select_nxv16f64( %a, %b, %c, i32 zeroext %evl) { ; CHECK-LABEL: select_nxv16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vmv1r.v v7, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: srli a4, a1, 3 +; CHECK-NEXT: vslidedown.vx v0, v0, a4 ; CHECK-NEXT: sub a4, a2, a1 ; CHECK-NEXT: add a3, a0, a3 -; CHECK-NEXT: sltu a5, a2, a4 -; CHECK-NEXT: vl8re64.v v8, (a3) -; CHECK-NEXT: addi a5, a5, -1 -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a3 -; CHECK-NEXT: and a4, a5, a4 -; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sltu a3, a2, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: bltu a2, a1, .LBB48_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: .LBB48_2: -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; 
CHECK-NEXT: ret %v = call @llvm.vp.select.nxv16f64( %a, %b, %c, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll index 5b577dc0f8df9..f359fbfc63632 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -126,10 +126,10 @@ define @test4(i64 %avl, i8 zeroext %cond, @test5(i64 %avl, i8 zeroext %cond, %a, %b) nounwind { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andi a2, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: bnez a2, .LBB4_3 +; CHECK-NEXT: andi a0, a1, 1 +; CHECK-NEXT: bnez a0, .LBB4_3 ; CHECK-NEXT: # %bb.1: # %if.else ; CHECK-NEXT: vfsub.vv v9, v8, v9 ; CHECK-NEXT: andi a1, a1, 2 @@ -234,8 +234,8 @@ if.end6: ; preds = %if.else5, %if.then4 define @test6(i64 %avl, i8 zeroext %cond, %a, %b) nounwind { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andi a2, a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: andi a2, a1, 1 ; CHECK-NEXT: bnez a2, .LBB5_3 ; CHECK-NEXT: # %bb.1: # %if.else ; CHECK-NEXT: vfsub.vv v8, v8, v9 @@ -245,9 +245,9 @@ define @test6(i64 %avl, i8 zeroext %cond, This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: slli a4, a3, 2 +; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: sub a0, a0, a3 ; CHECK-NEXT: add a1, a1, a4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, tu, ma ; CHECK-NEXT: vfmacc.vf v16, fa0, v8 ; CHECK-NEXT: vse32.v v16, (a2) -; CHECK-NEXT: vsetvli a3, a0, e32, m8, ta, ma ; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: vsetvli a3, a0, e32, m8, ta, ma ; CHECK-NEXT: bnez a3, .LBB8_1 ; CHECK-NEXT: .LBB8_2: # %for.end ; CHECK-NEXT: ret @@ -494,15 +494,15 @@ define void @saxpy_vec_demanded_fields(i64 %n, float %a, ptr nocapture readonly ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: slli a4, a3, 2 +; CHECK-NEXT: vle32.v v16, (a2) ; CHECK-NEXT: sub a0, a0, a3 ; CHECK-NEXT: add a1, a1, a4 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, tu, ma ; CHECK-NEXT: vfmacc.vf v16, fa0, v8 ; CHECK-NEXT: vse32.v v16, (a2) -; CHECK-NEXT: vsetvli a3, a0, e16, m4, ta, ma ; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: vsetvli a3, a0, e16, m4, ta, ma ; CHECK-NEXT: bnez a3, .LBB9_1 ; CHECK-NEXT: .LBB9_2: # %for.end ; CHECK-NEXT: ret @@ -544,9 +544,9 @@ declare void @llvm.riscv.vse.nxv16f32.i64(, ptr nocapture, define @test_vsetvli_x0_x0(ptr %x, ptr %y, %z, i64 %vl, i1 %cond) nounwind { ; CHECK-LABEL: test_vsetvli_x0_x0: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andi a3, a3, 1 ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: andi a3, a3, 1 ; CHECK-NEXT: beqz a3, .LBB10_2 ; CHECK-NEXT: # %bb.1: # %if ; CHECK-NEXT: vle16.v v10, (a1) @@ -583,9 +583,9 @@ declare @llvm.riscv.vadd.nxv2i32(, @test_vsetvli_x0_x0_2(ptr %x, ptr %y, ptr %z, i64 %vl, i1 %cond, i1 %cond2, %w) nounwind { ; CHECK-LABEL: test_vsetvli_x0_x0_2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andi a4, a4, 1 ; CHECK-NEXT: vsetvli zero, a3, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: andi a4, a4, 1 ; CHECK-NEXT: beqz a4, .LBB11_2 ; CHECK-NEXT: # %bb.1: # %if ; CHECK-NEXT: vle16.v v10, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 8b48dc43eca29..fd690bb31f716 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll @@ -109,13 +109,13 @@ define void @test6(ptr nocapture readonly %A, ptr nocapture %B, i64 %n) { ; CHECK-NEXT: .LBB5_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: slli a4, a3, 2 +; CHECK-NEXT: add a3, a3, a2 ; CHECK-NEXT: add a5, a0, a4 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vle32.v v8, (a5) -; CHECK-NEXT: add a3, a3, a2 ; CHECK-NEXT: vmsle.vi v9, v8, -3 ; CHECK-NEXT: vmsgt.vi v10, v8, 2 ; CHECK-NEXT: vmor.mm v0, v9, v10 -; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vse32.v v8, (a4), v0.t ; CHECK-NEXT: vsetvli a2, a2, e32, m1, ta, ma ; CHECK-NEXT: bnez a2, .LBB5_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll index c3b19b59ec3d6..f658a2c6b24a6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll @@ -11,10 +11,9 @@ define i32 @illegal_preserve_vl( %a, %x, pt ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-NEXT: vadd.vv v12, v12, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.x.s a1, v8 ; CHECK-NEXT: vs4r.v v12, (a0) -; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %index = add %x, %x store %index, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll index fd5bf4ebcede8..de12e23345f08 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll @@ -290,69 +290,68 @@ define @vtrunc_nxv32i64_nxv32i32( %a, @vwaddu_vv_mask_v8i32( %x, @vwadd_wv_mask_v8i32_nonzero( %x, @i1_zext( %va, %vb ; ; RV64-LABEL: i1_zext: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 42 -; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; RV64-NEXT: vadd.vi v8, v8, 1, v0.t +; RV64-NEXT: li a1, 42 ; RV64-NEXT: sh a1, 0(a0) ; RV64-NEXT: ret %vc = zext %va to diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll index 04ece9d94880c..dcbb1a88d3731 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll @@ -41,8 +41,8 @@ define @vwsubu_vv_mask_v8i32( %x, @vwsub_wv_mask_v8i32_nonzero( %x, This Loop Header: Depth=1 @@ -102,17 +100,17 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: add s0, a2, t6 ; RV32-NEXT: add s1, a4, t6 ; RV32-NEXT: vl2r.v v8, (s0) -; RV32-NEXT: add s0, a0, t6 +; RV32-NEXT: add s0, t6, t2 ; RV32-NEXT: vl2r.v v10, (s1) -; RV32-NEXT: add s1, t6, t2 -; RV32-NEXT: sltu t6, s1, t6 -; RV32-NEXT: add t5, t5, t6 -; RV32-NEXT: xor t6, s1, t4 +; RV32-NEXT: sltu s1, s0, t6 +; RV32-NEXT: add t5, t5, s1 +; RV32-NEXT: add t6, a0, t6 ; RV32-NEXT: vaaddu.vv v8, v8, v10 -; RV32-NEXT: or s2, t6, t5 -; RV32-NEXT: vs2r.v v8, (s0) -; RV32-NEXT: mv t6, s1 -; RV32-NEXT: bnez s2, .LBB0_13 +; RV32-NEXT: vs2r.v v8, (t6) +; RV32-NEXT: xor t6, s0, t4 +; RV32-NEXT: or s1, t6, t5 +; RV32-NEXT: mv t6, s0 +; RV32-NEXT: bnez s1, .LBB0_13 ; RV32-NEXT: # %bb.14: # %middle.block ; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1 ; RV32-NEXT: beq t4, a6, .LBB0_9 @@ -121,27 +119,25 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: # => This Inner Loop Header: Depth=2 ; RV32-NEXT: add t5, a2, t4 ; RV32-NEXT: add t6, a4, t4 -; 
RV32-NEXT: add s0, a0, t4 ; RV32-NEXT: lbu t5, 0(t5) ; RV32-NEXT: lbu t6, 0(t6) -; RV32-NEXT: addi t4, t4, 1 -; RV32-NEXT: seqz s1, t4 -; RV32-NEXT: add t3, t3, s1 ; RV32-NEXT: add t5, t5, t6 -; RV32-NEXT: xor t6, t4, a6 +; RV32-NEXT: add t6, a0, t4 +; RV32-NEXT: addi t4, t4, 1 ; RV32-NEXT: addi t5, t5, 1 ; RV32-NEXT: srli t5, t5, 1 -; RV32-NEXT: or t6, t6, t3 -; RV32-NEXT: sb t5, 0(s0) -; RV32-NEXT: bnez t6, .LBB0_15 +; RV32-NEXT: sb t5, 0(t6) +; RV32-NEXT: seqz t5, t4 +; RV32-NEXT: xor t6, t4, a6 +; RV32-NEXT: add t3, t3, t5 +; RV32-NEXT: or t5, t6, t3 +; RV32-NEXT: bnez t5, .LBB0_15 ; RV32-NEXT: j .LBB0_9 ; RV32-NEXT: .LBB0_16: ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 ; RV32-NEXT: .cfi_restore s1 -; RV32-NEXT: .cfi_restore s2 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: .LBB0_17: # %for.cond.cleanup @@ -436,16 +432,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: add s0, a2, a6 ; RV64-NEXT: add t6, a4, a6 ; RV64-NEXT: csrr t0, vlenb -; RV64-NEXT: li t2, 32 -; RV64-NEXT: slli t1, t1, 32 -; RV64-NEXT: srli t3, t1, 32 -; RV64-NEXT: mul t1, a1, t3 -; RV64-NEXT: add t5, t5, t1 -; RV64-NEXT: mul t1, a3, t3 -; RV64-NEXT: add s0, s0, t1 +; RV64-NEXT: slli t2, t1, 32 ; RV64-NEXT: slli t1, t0, 1 -; RV64-NEXT: mul t3, a5, t3 -; RV64-NEXT: add t6, t6, t3 +; RV64-NEXT: srli t2, t2, 32 +; RV64-NEXT: mul t3, a1, t2 +; RV64-NEXT: add t5, t5, t3 +; RV64-NEXT: mul t3, a3, t2 +; RV64-NEXT: mul t2, a5, t2 +; RV64-NEXT: add s0, s0, t3 +; RV64-NEXT: add t6, t6, t2 +; RV64-NEXT: li t2, 32 ; RV64-NEXT: mv t4, t1 ; RV64-NEXT: bltu t2, t1, .LBB0_4 ; RV64-NEXT: # %bb.3: # %for.cond1.preheader.us.preheader diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll index 72f25268109a1..ce344bd7553fe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert.ll @@ -393,8 +393,8 @@ define void @test10(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, pt ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: vle8.v v9, (a2) ; CHECK-NEXT: vaadd.vv v8, v8, v9 -; CHECK-NEXT: sub a3, a3, a4 ; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: sub a3, a3, a4 ; CHECK-NEXT: bnez a3, .LBB9_2 ; CHECK-NEXT: .LBB9_3: # %for.end ; CHECK-NEXT: ret @@ -432,8 +432,8 @@ define void @test11(ptr nocapture %ptr_dest, ptr nocapture readonly %ptr_op1, pt ; CHECK-NEXT: .LBB10_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vaadd.vv v8, v8, v9 -; CHECK-NEXT: sub a3, a3, a4 ; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: sub a3, a3, a4 ; CHECK-NEXT: beqz a3, .LBB10_3 ; CHECK-NEXT: # %bb.2: # %for.body ; CHECK-NEXT: # in Loop: Header=BB10_1 Depth=1 diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll index 5872a0995feba..f94c5635032a4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll @@ -8,10 +8,10 @@ define void @do.memmove() nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(c) ; CHECK-NEXT: addi a0, a0, %lo(c) -; CHECK-NEXT: addi a1, a0, 16 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 24 +; CHECK-NEXT: addi a2, a0, 16 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a2) ; 
CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 8 diff --git a/llvm/test/CodeGen/RISCV/scmp.ll b/llvm/test/CodeGen/RISCV/scmp.ll index a212714db53e0..8a0baa67d0293 100644 --- a/llvm/test/CodeGen/RISCV/scmp.ll +++ b/llvm/test/CodeGen/RISCV/scmp.ll @@ -89,15 +89,15 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; RV32I-NEXT: lw a2, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a6, 12(a0) ; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a7, 8(a0) -; RV32I-NEXT: beq a6, a5, .LBB4_2 +; RV32I-NEXT: lw a6, 8(a0) +; RV32I-NEXT: lw a7, 12(a0) +; RV32I-NEXT: beq a7, a5, .LBB4_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: slt t2, a6, a5 +; RV32I-NEXT: slt t2, a7, a5 ; RV32I-NEXT: j .LBB4_3 ; RV32I-NEXT: .LBB4_2: -; RV32I-NEXT: sltu t2, a7, a4 +; RV32I-NEXT: sltu t2, a6, a4 ; RV32I-NEXT: .LBB4_3: ; RV32I-NEXT: lw a1, 0(a1) ; RV32I-NEXT: lw t0, 0(a0) @@ -108,23 +108,23 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; RV32I-NEXT: .LBB4_5: ; RV32I-NEXT: sltu a0, t0, a1 ; RV32I-NEXT: .LBB4_6: -; RV32I-NEXT: xor t1, a6, a5 -; RV32I-NEXT: xor t3, a7, a4 +; RV32I-NEXT: xor t1, a7, a5 +; RV32I-NEXT: xor t3, a6, a4 ; RV32I-NEXT: or t1, t3, t1 ; RV32I-NEXT: beqz t1, .LBB4_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv a0, t2 ; RV32I-NEXT: .LBB4_8: -; RV32I-NEXT: beq a6, a5, .LBB4_11 +; RV32I-NEXT: beq a7, a5, .LBB4_11 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: slt a4, a5, a6 +; RV32I-NEXT: slt a4, a5, a7 ; RV32I-NEXT: bne a3, a2, .LBB4_12 ; RV32I-NEXT: .LBB4_10: ; RV32I-NEXT: sltu a1, a1, t0 ; RV32I-NEXT: bnez t1, .LBB4_13 ; RV32I-NEXT: j .LBB4_14 ; RV32I-NEXT: .LBB4_11: -; RV32I-NEXT: sltu a4, a4, a7 +; RV32I-NEXT: sltu a4, a4, a6 ; RV32I-NEXT: beq a3, a2, .LBB4_10 ; RV32I-NEXT: .LBB4_12: ; RV32I-NEXT: sltu a1, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/select-and.ll b/llvm/test/CodeGen/RISCV/select-and.ll index f827e840f4a36..01965a2da23f8 100644 --- a/llvm/test/CodeGen/RISCV/select-and.ll +++ b/llvm/test/CodeGen/RISCV/select-and.ll @@ -12,22 +12,22 @@ define signext i32 @select_of_and(i1 zeroext %a, i1 zeroext %b, i32 signext %c, i32 signext %d) nounwind { ; RV32I-LABEL: select_of_and: ; RV32I: # %bb.0: -; RV32I-NEXT: and a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a1, .LBB0_2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a2, a3 ; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: select_of_and: ; RV64I: # %bb.0: -; RV64I-NEXT: and a1, a0, a1 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: bnez a1, .LBB0_2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: bnez a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: mv a0, a3 +; RV64I-NEXT: mv a2, a3 ; RV64I-NEXT: .LBB0_2: +; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: ret ; ; RV64I-CCMOV-LABEL: select_of_and: diff --git a/llvm/test/CodeGen/RISCV/select-bare.ll b/llvm/test/CodeGen/RISCV/select-bare.ll index c9e108a1ca9d0..ab03b1a684730 100644 --- a/llvm/test/CodeGen/RISCV/select-bare.ll +++ b/llvm/test/CodeGen/RISCV/select-bare.ll @@ -7,12 +7,12 @@ define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind { ; RV32I-LABEL: bare_select: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a3, a0, 1 -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: bnez a3, .LBB0_2 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: bnez a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret ; ; RV64I-CCMOV-LABEL: bare_select: @@ -27,12 +27,12 @@ define i32 
@bare_select(i1 %a, i32 %b, i32 %c) nounwind { define float @bare_select_float(i1 %a, float %b, float %c) nounwind { ; RV32I-LABEL: bare_select_float: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a3, a0, 1 -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: bnez a3, .LBB1_2 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: bnez a0, .LBB1_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret ; ; RV64I-CCMOV-LABEL: bare_select_float: diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll index 1c2a0cf007d11..568fea4df4acc 100644 --- a/llvm/test/CodeGen/RISCV/select-cc.ll +++ b/llvm/test/CodeGen/RISCV/select-cc.ll @@ -163,48 +163,48 @@ define signext i32 @foo(i32 signext %a, ptr %b) nounwind { ; RV64I-CCMOV: # %bb.0: ; RV64I-CCMOV-NEXT: lw a2, 0(a1) ; RV64I-CCMOV-NEXT: lw a3, 0(a1) -; RV64I-CCMOV-NEXT: lw a4, 0(a1) -; RV64I-CCMOV-NEXT: lw a5, 0(a1) -; RV64I-CCMOV-NEXT: xor a6, a0, a2 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a6, a2, a0 -; RV64I-CCMOV-NEXT: xor a2, a0, a3 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a2, a0, a3 +; RV64I-CCMOV-NEXT: xor a4, a0, a2 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a2, a0 ; RV64I-CCMOV-NEXT: lw a2, 0(a1) -; RV64I-CCMOV-NEXT: sltu a3, a4, a0 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a3, a0, a4 +; RV64I-CCMOV-NEXT: xor a4, a0, a3 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a0, a3 ; RV64I-CCMOV-NEXT: lw a3, 0(a1) -; RV64I-CCMOV-NEXT: sltu a4, a0, a5 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a5, a0 -; RV64I-CCMOV-NEXT: lw a4, 0(a1) -; RV64I-CCMOV-NEXT: sltu a5, a0, a2 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a0, a2 +; RV64I-CCMOV-NEXT: sltu a4, a2, a0 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a0, a2 ; RV64I-CCMOV-NEXT: lw a2, 0(a1) -; RV64I-CCMOV-NEXT: sltu a5, a3, a0 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a3, a0 +; RV64I-CCMOV-NEXT: sltu a4, a0, a3 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a3, a0 ; RV64I-CCMOV-NEXT: lw a3, 0(a1) -; RV64I-CCMOV-NEXT: sext.w a5, a0 -; RV64I-CCMOV-NEXT: slt a5, a4, a5 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a0, a4 -; RV64I-CCMOV-NEXT: lw a4, 0(a1) -; RV64I-CCMOV-NEXT: sext.w a5, a0 -; RV64I-CCMOV-NEXT: slt a5, a5, a2 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a2, a0 +; RV64I-CCMOV-NEXT: sltu a4, a0, a2 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a0, a2 ; RV64I-CCMOV-NEXT: lw a2, 0(a1) -; RV64I-CCMOV-NEXT: sext.w a5, a0 -; RV64I-CCMOV-NEXT: slt a5, a5, a3 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a0, a3 +; RV64I-CCMOV-NEXT: sltu a4, a3, a0 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a3, a0 +; RV64I-CCMOV-NEXT: lw a3, 0(a1) +; RV64I-CCMOV-NEXT: sext.w a4, a0 +; RV64I-CCMOV-NEXT: slt a4, a2, a4 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a0, a2 +; RV64I-CCMOV-NEXT: lw a2, 0(a1) +; RV64I-CCMOV-NEXT: sext.w a4, a0 +; RV64I-CCMOV-NEXT: slt a4, a4, a3 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a3, a0 +; RV64I-CCMOV-NEXT: lw a3, 0(a1) +; RV64I-CCMOV-NEXT: sext.w a4, a0 +; RV64I-CCMOV-NEXT: slt a4, a4, a2 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a0, a2 +; RV64I-CCMOV-NEXT: lw a2, 0(a1) +; RV64I-CCMOV-NEXT: sext.w a4, a0 +; RV64I-CCMOV-NEXT: slt a4, a3, a4 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a3, a0 +; RV64I-CCMOV-NEXT: lw a3, 0(a1) +; RV64I-CCMOV-NEXT: slti a4, a2, 1 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a0, a2 +; RV64I-CCMOV-NEXT: slti a4, a2, 0 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a3, a0 ; RV64I-CCMOV-NEXT: lw a3, 0(a1) -; RV64I-CCMOV-NEXT: sext.w a5, a0 -; RV64I-CCMOV-NEXT: slt a5, a4, a5 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a4, a0 -; RV64I-CCMOV-NEXT: lw a4, 0(a1) -; 
RV64I-CCMOV-NEXT: slti a5, a2, 1 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a0, a2 -; RV64I-CCMOV-NEXT: slti a5, a2, 0 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a5, a3, a0 ; RV64I-CCMOV-NEXT: lw a1, 0(a1) -; RV64I-CCMOV-NEXT: slti a3, a4, 1025 -; RV64I-CCMOV-NEXT: mips.ccmov a0, a3, a4, a0 +; RV64I-CCMOV-NEXT: slti a4, a3, 1025 +; RV64I-CCMOV-NEXT: mips.ccmov a0, a4, a3, a0 ; RV64I-CCMOV-NEXT: sltiu a2, a2, 2047 ; RV64I-CCMOV-NEXT: mips.ccmov a0, a2, a1, a0 ; RV64I-CCMOV-NEXT: sext.w a0, a0 diff --git a/llvm/test/CodeGen/RISCV/select-constant-xor.ll b/llvm/test/CodeGen/RISCV/select-constant-xor.ll index 2e26ae78e2dd8..254ff96ef5648 100644 --- a/llvm/test/CodeGen/RISCV/select-constant-xor.ll +++ b/llvm/test/CodeGen/RISCV/select-constant-xor.ll @@ -172,12 +172,12 @@ define i32 @icmpasreq(i32 %input, i32 %a, i32 %b) { ; ; RV64-LABEL: icmpasreq: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB8_2 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: bltz a0, .LBB8_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB8_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %sh = ashr i32 %input, 31 %c = icmp eq i32 %sh, -1 @@ -197,12 +197,12 @@ define i32 @icmpasrne(i32 %input, i32 %a, i32 %b) { ; ; RV64-LABEL: icmpasrne: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bgez a3, .LBB9_2 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: bgez a0, .LBB9_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a1, a2 ; RV64-NEXT: .LBB9_2: +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret %sh = ashr i32 %input, 31 %c = icmp ne i32 %sh, -1 diff --git a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll index 005a01bf1000a..3020e61fd6985 100644 --- a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll +++ b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll @@ -96,24 +96,24 @@ entry: define i64 @cmov64(i1 %a, i64 %b, i64 %c) nounwind { ; RV32I-LABEL: cmov64: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: andi a5, a0, 1 -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: bnez a5, .LBB2_2 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: bnez a0, .LBB2_2 ; RV32I-NEXT: # %bb.1: # %entry -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: .LBB2_2: # %entry +; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: cmov64: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: andi a3, a0, 1 -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: bnez a3, .LBB2_2 +; RV64I-NEXT: andi a0, a0, 1 +; RV64I-NEXT: bnez a0, .LBB2_2 ; RV64I-NEXT: # %bb.1: # %entry -; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: mv a1, a2 ; RV64I-NEXT: .LBB2_2: # %entry +; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret entry: %cond = select i1 %a, i64 %b, i64 %c @@ -161,13 +161,13 @@ define i128 @cmov128(i1 %a, i128 %b, i128 %c) nounwind { ; ; RV64I-LABEL: cmov128: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: andi a5, a0, 1 -; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: bnez a5, .LBB3_2 +; RV64I-NEXT: andi a0, a0, 1 +; RV64I-NEXT: bnez a0, .LBB3_2 ; RV64I-NEXT: # %bb.1: # %entry -; RV64I-NEXT: mv a0, a3 +; RV64I-NEXT: mv a1, a3 ; RV64I-NEXT: mv a2, a4 ; RV64I-NEXT: .LBB3_2: # %entry +; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, a2 ; RV64I-NEXT: ret entry: @@ -221,9 +221,9 @@ define double @cmovdouble(i1 %a, double %b, double %c) nounwind { ; RV32I-NEXT: sw a3, 8(sp) ; RV32I-NEXT: sw a4, 12(sp) ; RV32I-NEXT: fld fa5, 8(sp) -; RV32I-NEXT: andi a0, a0, 1 ; RV32I-NEXT: sw a1, 8(sp) ; RV32I-NEXT: sw a2, 12(sp) +; 
RV32I-NEXT: andi a0, a0, 1 ; RV32I-NEXT: beqz a0, .LBB5_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: fld fa5, 8(sp) @@ -301,8 +301,8 @@ entry: define i32 @cmovdiffcc(i1 %a, i1 %b, i32 %c, i32 %d, i32 %e, i32 %f) nounwind { ; RV32I-LABEL: cmovdiffcc: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: andi a0, a0, 1 ; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: andi a0, a0, 1 ; RV32I-NEXT: beqz a0, .LBB7_3 ; RV32I-NEXT: # %bb.1: # %entry ; RV32I-NEXT: beqz a1, .LBB7_4 @@ -318,8 +318,8 @@ define i32 @cmovdiffcc(i1 %a, i1 %b, i32 %c, i32 %d, i32 %e, i32 %f) nounwind { ; ; RV64I-LABEL: cmovdiffcc: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: andi a0, a0, 1 ; RV64I-NEXT: andi a1, a1, 1 +; RV64I-NEXT: andi a0, a0, 1 ; RV64I-NEXT: beqz a0, .LBB7_3 ; RV64I-NEXT: # %bb.1: # %entry ; RV64I-NEXT: beqz a1, .LBB7_4 diff --git a/llvm/test/CodeGen/RISCV/select-or.ll b/llvm/test/CodeGen/RISCV/select-or.ll index 338c7c06c3ab8..b1ed06ad5b8cf 100644 --- a/llvm/test/CodeGen/RISCV/select-or.ll +++ b/llvm/test/CodeGen/RISCV/select-or.ll @@ -12,22 +12,22 @@ define signext i32 @select_of_or(i1 zeroext %a, i1 zeroext %b, i32 signext %c, i32 signext %d) nounwind { ; RV32I-LABEL: select_of_or: ; RV32I: # %bb.0: -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a2 -; RV32I-NEXT: bnez a1, .LBB0_2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: bnez a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a2, a3 ; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: select_of_or: ; RV64I: # %bb.0: -; RV64I-NEXT: or a1, a0, a1 -; RV64I-NEXT: mv a0, a2 -; RV64I-NEXT: bnez a1, .LBB0_2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: bnez a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: mv a0, a3 +; RV64I-NEXT: mv a2, a3 ; RV64I-NEXT: .LBB0_2: +; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: ret ; ; RV64I-CCMOV-LABEL: select_of_or: diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index e0a16aa05cd00..cb8fddd71e08c 100644 --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -269,8 +269,8 @@ define void @test6(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-NEXT: .LBB5_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: call baz -; CHECK-NEXT: feq.s a1, fa0, fs0 ; CHECK-NEXT: fcvt.w.s a0, fa0, rtz +; CHECK-NEXT: feq.s a1, fa0, fs0 ; CHECK-NEXT: beqz a1, .LBB5_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -289,8 +289,8 @@ define void @test6(i32 signext %arg, i32 signext %arg1) nounwind { ; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 ; NOREMOVAL-NEXT: sext.w a0, a0 ; NOREMOVAL-NEXT: call baz -; NOREMOVAL-NEXT: feq.s a1, fa0, fs0 ; NOREMOVAL-NEXT: fcvt.w.s a0, fa0, rtz +; NOREMOVAL-NEXT: feq.s a1, fa0, fs0 ; NOREMOVAL-NEXT: beqz a1, .LBB5_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -526,8 +526,8 @@ define void @test10(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-NEXT: .LBB9_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: call baz -; CHECK-NEXT: feq.s a1, fa0, fs0 ; CHECK-NEXT: fmv.x.w a0, fa0 +; CHECK-NEXT: feq.s a1, fa0, fs0 ; CHECK-NEXT: beqz a1, .LBB9_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -546,8 +546,8 @@ define void @test10(i32 signext %arg, i32 signext %arg1) nounwind { ; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 ; NOREMOVAL-NEXT: sext.w a0, a0 ; NOREMOVAL-NEXT: call baz -; NOREMOVAL-NEXT: feq.s a1, fa0, 
fs0 ; NOREMOVAL-NEXT: fmv.x.w a0, fa0 +; NOREMOVAL-NEXT: feq.s a1, fa0, fs0 ; NOREMOVAL-NEXT: beqz a1, .LBB9_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -578,8 +578,8 @@ define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3) { ; CHECK-NEXT: .LBB10_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: andi a0, a0, 1234 -; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addw a0, a0, a1 +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: bltu a2, a3, .LBB10_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: ret @@ -591,8 +591,8 @@ define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3) { ; NOREMOVAL-NEXT: .LBB10_1: # %bb2 ; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 ; NOREMOVAL-NEXT: andi a0, a0, 1234 -; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: bltu a2, a3, .LBB10_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: sext.w a0, a0 @@ -626,8 +626,8 @@ define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3) { ; CHECK-NEXT: mulw a2, a0, a1 ; CHECK-NEXT: addw a0, a0, a2 ; CHECK-NEXT: and a2, a2, a0 -; CHECK-NEXT: addi a3, a3, 1 ; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: addi a3, a3, 1 ; CHECK-NEXT: bltu a3, a4, .LBB11_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: mv a0, a2 @@ -643,8 +643,8 @@ define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3) { ; NOREMOVAL-NEXT: mul a4, a0, a1 ; NOREMOVAL-NEXT: add a0, a0, a4 ; NOREMOVAL-NEXT: and a4, a4, a0 -; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: add a0, a4, a1 +; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: bltu a2, a3, .LBB11_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: sext.w a0, a4 @@ -678,8 +678,8 @@ define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3) { ; CHECK-NEXT: .LBB12_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: div a0, a0, a1 -; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: bltu a2, a3, .LBB12_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: sext.w a0, a0 @@ -692,8 +692,8 @@ define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3) { ; NOREMOVAL-NEXT: .LBB12_1: # %bb2 ; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 ; NOREMOVAL-NEXT: div a0, a0, a1 -; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: add a0, a0, a1 +; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: bltu a2, a3, .LBB12_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: sext.w a0, a0 @@ -989,8 +989,8 @@ define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4) { ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: andi a0, a0, 1234 ; CHECK-NEXT: addw a0, a0, a1 -; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: sw a0, 0(a3) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: bltu a2, a4, .LBB17_1 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: ret @@ -1003,8 +1003,8 @@ define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4) { ; NOREMOVAL-NEXT: # =>This Inner Loop Header: Depth=1 ; NOREMOVAL-NEXT: andi a0, a0, 1234 ; NOREMOVAL-NEXT: add a0, a0, a1 -; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: sw a0, 0(a3) +; NOREMOVAL-NEXT: addi a2, a2, 1 ; NOREMOVAL-NEXT: bltu a2, a4, .LBB17_1 ; NOREMOVAL-NEXT: # %bb.2: # %bb7 ; NOREMOVAL-NEXT: sext.w a0, a0 diff --git a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll index 1e893d9baa494..40806c5ecdf48 100644 --- a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll +++ b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll @@ 
-141,10 +141,9 @@ define i64 @ashr_by_complemented_64(i64 %x) { ; RV32I-NEXT: sub a4, a4, a2 ; RV32I-NEXT: not a2, a4 ; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: sll a1, a1, a2 -; RV32I-NEXT: or a3, a3, a1 +; RV32I-NEXT: sll a2, a1, a2 ; RV32I-NEXT: mv a1, a0 -; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: or a0, a3, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ashr_by_complemented_64: @@ -178,25 +177,25 @@ define i32 @shl_by_masked_complemented_32(i32 %x) { define i64 @shl_by_masked_complemented_64(i64 %x) { ; RV32I-LABEL: shl_by_masked_complemented_64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a2, 63 -; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: andi a4, a2, 63 -; RV32I-NEXT: addi a2, a4, -32 -; RV32I-NEXT: not a3, a0 -; RV32I-NEXT: bltz a2, .LBB7_2 +; RV32I-NEXT: not a2, a0 +; RV32I-NEXT: li a3, 63 +; RV32I-NEXT: sub a3, a3, a0 +; RV32I-NEXT: andi a4, a3, 63 +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: bltz a3, .LBB7_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sll a1, a0, a4 ; RV32I-NEXT: j .LBB7_3 ; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: sll a1, a1, a3 +; RV32I-NEXT: sll a1, a1, a2 ; RV32I-NEXT: not a4, a4 ; RV32I-NEXT: srli a5, a0, 1 ; RV32I-NEXT: srl a4, a5, a4 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: .LBB7_3: -; RV32I-NEXT: sll a0, a0, a3 -; RV32I-NEXT: srai a2, a2, 31 -; RV32I-NEXT: and a0, a2, a0 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: and a0, a3, a0 ; RV32I-NEXT: ret ; ; RV64I-LABEL: shl_by_masked_complemented_64: @@ -213,25 +212,25 @@ define i64 @shl_by_masked_complemented_64(i64 %x) { define i64 @lshr_by_masked_complemented_64(i64 %x) { ; RV32I-LABEL: lshr_by_masked_complemented_64: ; RV32I: # %bb.0: -; RV32I-NEXT: li a2, 63 -; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: andi a4, a2, 63 -; RV32I-NEXT: addi a2, a4, -32 -; RV32I-NEXT: not a3, a0 -; RV32I-NEXT: bltz a2, .LBB8_2 +; RV32I-NEXT: not a2, a0 +; RV32I-NEXT: li a3, 63 +; RV32I-NEXT: sub a3, a3, a0 +; RV32I-NEXT: andi a4, a3, 63 +; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: bltz a3, .LBB8_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srl a0, a1, a4 ; RV32I-NEXT: j .LBB8_3 ; RV32I-NEXT: .LBB8_2: -; RV32I-NEXT: srl a0, a0, a3 +; RV32I-NEXT: srl a0, a0, a2 ; RV32I-NEXT: not a4, a4 ; RV32I-NEXT: slli a5, a1, 1 ; RV32I-NEXT: sll a4, a5, a4 ; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: .LBB8_3: -; RV32I-NEXT: srl a1, a1, a3 -; RV32I-NEXT: srai a2, a2, 31 -; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: and a1, a3, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: lshr_by_masked_complemented_64: @@ -250,22 +249,23 @@ define i64 @ashr_by_masked_complemented_64(i64 %x) { ; RV32I: # %bb.0: ; RV32I-NEXT: li a2, 63 ; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: andi a2, a2, 63 -; RV32I-NEXT: addi a3, a2, -32 -; RV32I-NEXT: bltz a3, .LBB9_2 +; RV32I-NEXT: andi a3, a2, 63 +; RV32I-NEXT: addi a2, a3, -32 +; RV32I-NEXT: bltz a2, .LBB9_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: srai a1, a1, 31 -; RV32I-NEXT: sra a0, a0, a2 +; RV32I-NEXT: srai a2, a1, 31 +; RV32I-NEXT: sra a0, a1, a3 +; RV32I-NEXT: mv a1, a2 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: not a3, a0 -; RV32I-NEXT: not a2, a2 -; RV32I-NEXT: slli a4, a1, 1 -; RV32I-NEXT: sra a1, a1, a3 -; RV32I-NEXT: srl a0, a0, a3 -; RV32I-NEXT: sll a2, a4, a2 -; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: not a4, a0 +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: slli a5, a1, 1 +; RV32I-NEXT: sra a2, a1, a4 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: sll a1, a5, a3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: mv a1, a2 ; 
RV32I-NEXT: ret ; ; RV64I-LABEL: ashr_by_masked_complemented_64: diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 249dabba0cc28..fcf34b5612689 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -13,8 +13,8 @@ declare i128 @llvm.fshr.i128(i128, i128, i128) define i64 @lshr64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: lshr64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: srl a3, a1, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB0_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a3 @@ -60,13 +60,12 @@ define i64 @ashr64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: ashr64: ; RV32I: # %bb.0: ; RV32I-NEXT: mv a3, a1 -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: sra a1, a1, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB2_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a3, a3, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: srai a1, a3, 31 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB2_2: ; RV32I-NEXT: srl a0, a0, a2 @@ -105,8 +104,8 @@ define i64 @ashr64_minsize(i64 %a, i64 %b) minsize nounwind { define i64 @shl64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: shl64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: sll a3, a0, a2 +; RV32I-NEXT: addi a4, a2, -32 ; RV32I-NEXT: bltz a4, .LBB4_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a1, a3 @@ -197,8 +196,8 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; ; RV64I-LABEL: lshr128: ; RV64I: # %bb.0: -; RV64I-NEXT: addi a4, a2, -64 ; RV64I-NEXT: srl a3, a1, a2 +; RV64I-NEXT: addi a4, a2, -64 ; RV64I-NEXT: bltz a4, .LBB6_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a3 @@ -268,13 +267,12 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind { ; RV64I-LABEL: ashr128: ; RV64I: # %bb.0: ; RV64I-NEXT: mv a3, a1 -; RV64I-NEXT: addi a4, a2, -64 ; RV64I-NEXT: sra a1, a1, a2 +; RV64I-NEXT: addi a4, a2, -64 ; RV64I-NEXT: bltz a4, .LBB7_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: srai a3, a3, 63 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: srai a1, a3, 63 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB7_2: ; RV64I-NEXT: srl a0, a0, a2 @@ -308,12 +306,12 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: srli a1, a2, 3 ; RV32I-NEXT: andi a3, a2, 31 ; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: sub a1, a6, a1 ; RV32I-NEXT: lw a4, 0(a1) ; RV32I-NEXT: lw a5, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: sll a7, a5, a2 ; RV32I-NEXT: srli t0, a4, 1 ; RV32I-NEXT: sll a1, a1, a2 @@ -336,8 +334,8 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; ; RV64I-LABEL: shl128: ; RV64I: # %bb.0: -; RV64I-NEXT: addi a4, a2, -64 ; RV64I-NEXT: sll a3, a0, a2 +; RV64I-NEXT: addi a4, a2, -64 ; RV64I-NEXT: bltz a4, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a1, a3 @@ -394,21 +392,21 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-LABEL: fshr128_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a2, 0(a2) ; RV32I-NEXT: lw t1, 0(a1) ; RV32I-NEXT: lw a7, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) -; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: andi t2, a2, 64 +; RV32I-NEXT: lw a3, 12(a1) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: andi t2, a1, 64 ; RV32I-NEXT: mv t0, a7 -; RV32I-NEXT: mv a3, t1 +; RV32I-NEXT: mv a2, t1 ; RV32I-NEXT: beqz t2, .LBB10_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv t0, a1 -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: mv t0, a3 +; RV32I-NEXT: mv a2, 
a4 ; RV32I-NEXT: .LBB10_2: -; RV32I-NEXT: andi a6, a2, 32 -; RV32I-NEXT: mv a5, a3 +; RV32I-NEXT: andi a6, a1, 32 +; RV32I-NEXT: mv a5, a2 ; RV32I-NEXT: bnez a6, .LBB10_13 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: bnez t2, .LBB10_14 @@ -418,31 +416,31 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-NEXT: mv t0, a4 ; RV32I-NEXT: .LBB10_6: ; RV32I-NEXT: slli t3, t0, 1 -; RV32I-NEXT: not t1, a2 +; RV32I-NEXT: not t1, a1 ; RV32I-NEXT: beqz t2, .LBB10_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv a1, a7 +; RV32I-NEXT: mv a3, a7 ; RV32I-NEXT: .LBB10_8: -; RV32I-NEXT: srl a7, a5, a2 +; RV32I-NEXT: srl a7, a5, a1 ; RV32I-NEXT: sll t2, t3, t1 -; RV32I-NEXT: srl t0, t0, a2 +; RV32I-NEXT: srl t0, t0, a1 ; RV32I-NEXT: beqz a6, .LBB10_10 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: mv a4, a3 ; RV32I-NEXT: .LBB10_10: ; RV32I-NEXT: or a7, t2, a7 ; RV32I-NEXT: slli t2, a4, 1 ; RV32I-NEXT: sll t2, t2, t1 ; RV32I-NEXT: or t0, t2, t0 -; RV32I-NEXT: srl a4, a4, a2 +; RV32I-NEXT: srl a4, a4, a1 ; RV32I-NEXT: beqz a6, .LBB10_12 ; RV32I-NEXT: # %bb.11: -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a3, a2 ; RV32I-NEXT: .LBB10_12: -; RV32I-NEXT: slli a3, a1, 1 -; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: slli a2, a3, 1 +; RV32I-NEXT: srl a1, a3, a1 ; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: sll a2, a3, t1 +; RV32I-NEXT: sll a2, a2, t1 ; RV32I-NEXT: sll a3, a5, t1 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: or a1, a3, a1 diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll index 500673cc29196..f408011b31456 100644 --- a/llvm/test/CodeGen/RISCV/shl-cttz.ll +++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll @@ -415,20 +415,20 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) { ; RV32I-NEXT: .cfi_offset ra, -4 ; RV32I-NEXT: .cfi_offset s0, -8 ; RV32I-NEXT: .cfi_offset s1, -12 -; RV32I-NEXT: neg a2, a1 -; RV32I-NEXT: and a1, a1, a2 -; RV32I-NEXT: lui a2, 30667 -; RV32I-NEXT: addi a2, a2, 1329 -; RV32I-NEXT: mul a1, a1, a2 -; RV32I-NEXT: srli a1, a1, 27 -; RV32I-NEXT: lui a2, %hi(.LCPI7_0) -; RV32I-NEXT: addi a2, a2, %lo(.LCPI7_0) -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: lbu s0, 0(a1) -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: neg a0, a1 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: lui a1, 30667 +; RV32I-NEXT: addi a1, a1, 1329 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: lui a1, %hi(.LCPI7_0) +; RV32I-NEXT: addi a1, a1, %lo(.LCPI7_0) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lbu s1, 0(a0) +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call use32 -; RV32I-NEXT: sll a0, s1, s0 +; RV32I-NEXT: sll a0, s0, s1 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload @@ -474,20 +474,20 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: negw a2, a1 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: lui a2, 30667 -; RV64I-NEXT: addi a2, a2, 1329 -; RV64I-NEXT: mul a1, a1, a2 -; RV64I-NEXT: srliw a1, a1, 27 -; RV64I-NEXT: lui a2, %hi(.LCPI7_0) -; RV64I-NEXT: addi a2, a2, %lo(.LCPI7_0) -; RV64I-NEXT: add a1, a2, a1 -; RV64I-NEXT: lbu s0, 0(a1) -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: negw a0, a1 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: lui a1, 30667 +; RV64I-NEXT: addi a1, a1, 1329 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: srliw a0, 
a0, 27 +; RV64I-NEXT: lui a1, %hi(.LCPI7_0) +; RV64I-NEXT: addi a1, a1, %lo(.LCPI7_0) +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu s1, 0(a0) +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call use32 -; RV64I-NEXT: sllw a0, s1, s0 +; RV64I-NEXT: sllw a0, s0, s1 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -554,8 +554,8 @@ define i64 @shl_cttz_i64(i64 %x, i64 %y) { ; RV32I-NEXT: add a2, a4, a2 ; RV32I-NEXT: lbu a4, 0(a2) ; RV32I-NEXT: .LBB8_3: # %entry -; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: sll a2, a0, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB8_5 ; RV32I-NEXT: # %bb.4: # %entry ; RV32I-NEXT: mv a1, a2 @@ -581,8 +581,8 @@ define i64 @shl_cttz_i64(i64 %x, i64 %y) { ; RV32ZBB-NEXT: .LBB8_2: ; RV32ZBB-NEXT: ctz a4, a2 ; RV32ZBB-NEXT: .LBB8_3: # %entry -; RV32ZBB-NEXT: addi a3, a4, -32 ; RV32ZBB-NEXT: sll a2, a0, a4 +; RV32ZBB-NEXT: addi a3, a4, -32 ; RV32ZBB-NEXT: bltz a3, .LBB8_5 ; RV32ZBB-NEXT: # %bb.4: # %entry ; RV32ZBB-NEXT: mv a1, a2 @@ -642,8 +642,8 @@ define i64 @shl_cttz_constant_i64(i64 %y) { ; RV32I-NEXT: lbu a1, 0(a0) ; RV32I-NEXT: .LBB9_3: # %entry ; RV32I-NEXT: li a0, 4 -; RV32I-NEXT: addi a2, a1, -32 ; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: addi a2, a1, -32 ; RV32I-NEXT: bltz a2, .LBB9_5 ; RV32I-NEXT: # %bb.4: # %entry ; RV32I-NEXT: mv a1, a0 @@ -668,8 +668,8 @@ define i64 @shl_cttz_constant_i64(i64 %y) { ; RV32ZBB-NEXT: ctz a1, a0 ; RV32ZBB-NEXT: .LBB9_3: # %entry ; RV32ZBB-NEXT: li a0, 4 -; RV32ZBB-NEXT: addi a2, a1, -32 ; RV32ZBB-NEXT: sll a0, a0, a1 +; RV32ZBB-NEXT: addi a2, a1, -32 ; RV32ZBB-NEXT: bltz a2, .LBB9_5 ; RV32ZBB-NEXT: # %bb.4: # %entry ; RV32ZBB-NEXT: mv a1, a0 diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index 8f5b044c3b3b8..6d14c0d76a45c 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -56,10 +56,10 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-LABEL: test2: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a4, 20 ; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: add a0, a0, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body @@ -77,8 +77,8 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV64I-LABEL: test2: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: li a3, 0 -; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: lui a4, 20 +; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: addiw a4, a4, -1920 ; RV64I-NEXT: add a1, a1, a4 ; RV64I-NEXT: add a0, a0, a4 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 42c87c9660dc9..e3aeae4df2be1 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -304,24 +304,24 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s6, 0(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lbu a1, 12(a0) -; RV32-NEXT: lw a2, 8(a0) -; RV32-NEXT: lw a3, 4(a0) ; RV32-NEXT: lw a0, 0(a0) -; RV32-NEXT: slli a4, a1, 30 +; RV32-NEXT: lw a1, 4(s0) +; RV32-NEXT: lw a2, 8(s0) +; RV32-NEXT: lbu a3, 12(s0) +; RV32-NEXT: slli a4, a3, 30 ; RV32-NEXT: srli s1, a2, 2 ; RV32-NEXT: slli a5, a2, 31 ; RV32-NEXT: or s1, s1, a4 -; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: srli a4, a1, 1 ; RV32-NEXT: or s2, a4, 
a5 -; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: srli a3, a3, 2 ; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: slli a3, a3, 31 ; RV32-NEXT: slli a1, a1, 31 +; RV32-NEXT: slli a3, a3, 31 ; RV32-NEXT: slli a2, a2, 31 -; RV32-NEXT: srai s3, a1, 31 +; RV32-NEXT: srai s3, a3, 31 ; RV32-NEXT: srai s4, a2, 31 -; RV32-NEXT: srai a1, a3, 31 +; RV32-NEXT: srai a1, a1, 31 ; RV32-NEXT: li a2, 6 ; RV32-NEXT: li a3, 0 ; RV32-NEXT: call __moddi3 @@ -383,19 +383,19 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lbu a0, 12(a0) -; RV64-NEXT: ld a1, 0(s0) -; RV64-NEXT: lwu a2, 8(s0) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a3, a1, 2 -; RV64-NEXT: or a0, a2, a0 -; RV64-NEXT: slli a2, a2, 62 -; RV64-NEXT: slli a1, a1, 31 -; RV64-NEXT: or a2, a2, a3 -; RV64-NEXT: slli s1, a0, 29 -; RV64-NEXT: srai a0, a2, 31 -; RV64-NEXT: srai s1, s1, 31 -; RV64-NEXT: srai s2, a1, 31 +; RV64-NEXT: ld a0, 0(a0) +; RV64-NEXT: lwu a1, 8(s0) +; RV64-NEXT: lbu a2, 12(s0) +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: srli a3, a0, 2 +; RV64-NEXT: or a2, a1, a2 +; RV64-NEXT: slli a1, a1, 62 +; RV64-NEXT: slli a4, a0, 31 +; RV64-NEXT: or a0, a1, a3 +; RV64-NEXT: slli a2, a2, 29 +; RV64-NEXT: srai a0, a0, 31 +; RV64-NEXT: srai s1, a2, 31 +; RV64-NEXT: srai s2, a4, 31 ; RV64-NEXT: li a1, 7 ; RV64-NEXT: call __moddi3 ; RV64-NEXT: mv s3, a0 @@ -456,24 +456,24 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32M-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32M-NEXT: sw s6, 0(sp) # 4-byte Folded Spill ; RV32M-NEXT: mv s0, a0 -; RV32M-NEXT: lbu a1, 12(a0) -; RV32M-NEXT: lw a2, 8(a0) -; RV32M-NEXT: lw a3, 4(a0) ; RV32M-NEXT: lw a0, 0(a0) -; RV32M-NEXT: slli a4, a1, 30 +; RV32M-NEXT: lw a1, 4(s0) +; RV32M-NEXT: lw a2, 8(s0) +; RV32M-NEXT: lbu a3, 12(s0) +; RV32M-NEXT: slli a4, a3, 30 ; RV32M-NEXT: srli s1, a2, 2 ; RV32M-NEXT: slli a5, a2, 31 ; RV32M-NEXT: or s1, s1, a4 -; RV32M-NEXT: srli a4, a3, 1 +; RV32M-NEXT: srli a4, a1, 1 ; RV32M-NEXT: or s2, a4, a5 -; RV32M-NEXT: srli a1, a1, 2 +; RV32M-NEXT: srli a3, a3, 2 ; RV32M-NEXT: srli a2, a2, 1 -; RV32M-NEXT: slli a3, a3, 31 ; RV32M-NEXT: slli a1, a1, 31 +; RV32M-NEXT: slli a3, a3, 31 ; RV32M-NEXT: slli a2, a2, 31 -; RV32M-NEXT: srai s3, a1, 31 +; RV32M-NEXT: srai s3, a3, 31 ; RV32M-NEXT: srai s4, a2, 31 -; RV32M-NEXT: srai a1, a3, 31 +; RV32M-NEXT: srai a1, a1, 31 ; RV32M-NEXT: li a2, 6 ; RV32M-NEXT: li a3, 0 ; RV32M-NEXT: call __moddi3 @@ -606,26 +606,26 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: slli a1, a1, 1 ; RV32MV-NEXT: sub sp, sp, a1 ; RV32MV-NEXT: mv s0, a0 -; RV32MV-NEXT: lw a1, 8(a0) -; RV32MV-NEXT: lbu a2, 12(a0) -; RV32MV-NEXT: lw a3, 4(a0) ; RV32MV-NEXT: lw a0, 0(a0) +; RV32MV-NEXT: lw a1, 4(s0) +; RV32MV-NEXT: lw a2, 8(s0) +; RV32MV-NEXT: lbu a3, 12(s0) ; RV32MV-NEXT: li a4, 1 -; RV32MV-NEXT: slli a5, a2, 30 -; RV32MV-NEXT: srli s1, a1, 2 -; RV32MV-NEXT: slli a6, a1, 31 +; RV32MV-NEXT: slli a5, a3, 30 +; RV32MV-NEXT: srli s1, a2, 2 +; RV32MV-NEXT: slli a6, a2, 31 ; RV32MV-NEXT: or s1, s1, a5 -; RV32MV-NEXT: srli a5, a3, 1 +; RV32MV-NEXT: srli a5, a1, 1 ; RV32MV-NEXT: or s2, a5, a6 ; RV32MV-NEXT: li a5, -1 -; RV32MV-NEXT: srli a2, a2, 2 -; RV32MV-NEXT: srli a1, a1, 1 +; RV32MV-NEXT: srli a3, a3, 2 +; RV32MV-NEXT: srli a2, a2, 1 +; RV32MV-NEXT: slli a1, a1, 31 ; RV32MV-NEXT: slli a3, a3, 31 ; RV32MV-NEXT: slli a2, a2, 31 -; RV32MV-NEXT: slli a6, a1, 31 -; RV32MV-NEXT: srai a1, a3, 31 -; RV32MV-NEXT: srai s3, a2, 31 -; 
RV32MV-NEXT: srai s4, a6, 31 +; RV32MV-NEXT: srai a1, a1, 31 +; RV32MV-NEXT: srai s3, a3, 31 +; RV32MV-NEXT: srai s4, a2, 31 ; RV32MV-NEXT: sw a5, 16(sp) ; RV32MV-NEXT: sw a4, 20(sp) ; RV32MV-NEXT: li a2, 6 @@ -653,17 +653,18 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: mv a0, s1 ; RV32MV-NEXT: mv a1, s3 ; RV32MV-NEXT: call __moddi3 -; RV32MV-NEXT: addi a2, sp, 16 -; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32MV-NEXT: vlse64.v v8, (a2), zero ; RV32MV-NEXT: addi a2, sp, 32 -; RV32MV-NEXT: vl2r.v v10, (a2) # Unknown-size Folded Reload +; RV32MV-NEXT: vl2r.v v8, (a2) # Unknown-size Folded Reload +; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32MV-NEXT: vslide1down.vx v8, v8, a0 +; RV32MV-NEXT: addi a0, sp, 16 +; RV32MV-NEXT: vslide1down.vx v8, v8, a1 +; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32MV-NEXT: vlse64.v v10, (a0), zero ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32MV-NEXT: vslide1down.vx v10, v10, a0 -; RV32MV-NEXT: vslide1down.vx v10, v10, a1 -; RV32MV-NEXT: vslidedown.vi v10, v10, 2 +; RV32MV-NEXT: vslidedown.vi v8, v8, 2 ; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32MV-NEXT: vand.vv v8, v10, v8 +; RV32MV-NEXT: vand.vv v8, v8, v10 ; RV32MV-NEXT: vsetivli zero, 3, e8, mf2, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 1 ; RV32MV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index cf65d4e0cf805..5cb7e1388a08f 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -18,30 +18,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: lh s0, 4(a1) -; RV32I-NEXT: lh s1, 8(a1) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh a0, 0(a1) +; RV32I-NEXT: lh s1, 4(a1) +; RV32I-NEXT: lh s2, 8(a1) +; RV32I-NEXT: lh s3, 12(a1) ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __modsi3 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, -124 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: li a1, 98 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: li a1, -1003 +; RV32I-NEXT: li a1, 98 ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh s4, 0(s3) -; RV32I-NEXT: sh s0, 2(s3) -; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh a0, 6(s3) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: li a1, -1003 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh s4, 0(s0) +; RV32I-NEXT: sh s1, 2(s0) +; RV32I-NEXT: sh s2, 4(s0) +; RV32I-NEXT: sh a0, 6(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -110,30 +109,29 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: lh s0, 8(a1) -; RV64I-NEXT: lh s1, 16(a1) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh a0, 0(a1) +; RV64I-NEXT: lh s1, 8(a1) +; RV64I-NEXT: lh s2, 16(a1) +; RV64I-NEXT: lh s3, 24(a1) ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, a2 ; 
RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, -124 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: li a1, -1003 +; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh s4, 0(s3) -; RV64I-NEXT: sh s0, 2(s3) -; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh a0, 6(s3) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: li a1, -1003 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh s4, 0(s0) +; RV64I-NEXT: sh s1, 2(s0) +; RV64I-NEXT: sh s2, 4(s0) +; RV64I-NEXT: sh a0, 6(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -206,30 +204,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh a2, 0(a1) -; RV32I-NEXT: lh s0, 4(a1) -; RV32I-NEXT: lh s1, 8(a1) -; RV32I-NEXT: lh s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh a0, 0(a1) +; RV32I-NEXT: lh s1, 4(a1) +; RV32I-NEXT: lh s2, 8(a1) +; RV32I-NEXT: lh s3, 12(a1) ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __modsi3 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh s4, 0(s3) -; RV32I-NEXT: sh s0, 2(s3) -; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh a0, 6(s3) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: li a1, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh s4, 0(s0) +; RV32I-NEXT: sh s1, 2(s0) +; RV32I-NEXT: sh s2, 4(s0) +; RV32I-NEXT: sh a0, 6(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -291,30 +288,29 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh a2, 0(a1) -; RV64I-NEXT: lh s0, 8(a1) -; RV64I-NEXT: lh s1, 16(a1) -; RV64I-NEXT: lh s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh a0, 0(a1) +; RV64I-NEXT: lh s1, 8(a1) +; RV64I-NEXT: lh s2, 16(a1) +; RV64I-NEXT: lh s3, 24(a1) ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh s4, 0(s3) -; RV64I-NEXT: sh s0, 2(s3) -; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh a0, 6(s3) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: li a1, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh s4, 0(s0) +; RV64I-NEXT: sh s1, 2(s0) +; RV64I-NEXT: sh s2, 4(s0) +; RV64I-NEXT: sh a0, 6(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 
24(sp) # 8-byte Folded Reload @@ -326,20 +322,20 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_srem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI1_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI1_0)(a2) -; RV64IM-NEXT: lh a3, 0(a1) -; RV64IM-NEXT: lh a4, 8(a1) -; RV64IM-NEXT: lh a5, 16(a1) +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a4, 16(a1) ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: mulh a6, a3, a2 -; RV64IM-NEXT: mulh a7, a4, a2 -; RV64IM-NEXT: mulh t0, a5, a2 -; RV64IM-NEXT: mulh a2, a1, a2 -; RV64IM-NEXT: add a6, a6, a3 -; RV64IM-NEXT: add a7, a7, a4 -; RV64IM-NEXT: add t0, t0, a5 -; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: lui a5, %hi(.LCPI1_0) +; RV64IM-NEXT: ld a5, %lo(.LCPI1_0)(a5) +; RV64IM-NEXT: mulh a6, a2, a5 +; RV64IM-NEXT: mulh a7, a3, a5 +; RV64IM-NEXT: mulh t0, a4, a5 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a6, a6, a2 +; RV64IM-NEXT: add a7, a7, a3 +; RV64IM-NEXT: add t0, t0, a4 +; RV64IM-NEXT: add a5, a5, a1 ; RV64IM-NEXT: srli t1, a6, 63 ; RV64IM-NEXT: srli a6, a6, 6 ; RV64IM-NEXT: add a6, a6, t1 @@ -349,21 +345,21 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: srli t1, t0, 63 ; RV64IM-NEXT: srli t0, t0, 6 ; RV64IM-NEXT: add t0, t0, t1 -; RV64IM-NEXT: srli t1, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, t1 +; RV64IM-NEXT: srli t1, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a5, a5, t1 ; RV64IM-NEXT: li t1, 95 ; RV64IM-NEXT: mul a6, a6, t1 ; RV64IM-NEXT: mul a7, a7, t1 ; RV64IM-NEXT: mul t0, t0, t1 -; RV64IM-NEXT: mul a2, a2, t1 -; RV64IM-NEXT: subw a3, a3, a6 -; RV64IM-NEXT: subw a4, a4, a7 -; RV64IM-NEXT: subw a5, a5, t0 -; RV64IM-NEXT: subw a1, a1, a2 -; RV64IM-NEXT: sh a3, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) -; RV64IM-NEXT: sh a5, 4(a0) +; RV64IM-NEXT: mul a5, a5, t1 +; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, a7 +; RV64IM-NEXT: subw a4, a4, t0 +; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a4, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, @@ -386,11 +382,11 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lh s1, 0(a1) ; RV32I-NEXT: lh s2, 4(a1) ; RV32I-NEXT: lh s3, 8(a1) ; RV32I-NEXT: lh s4, 12(a1) -; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s4 ; RV32I-NEXT: call __modsi3 @@ -503,11 +499,11 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s6, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s7, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s8, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lh s1, 0(a1) ; RV64I-NEXT: lh s2, 8(a1) ; RV64I-NEXT: lh s3, 16(a1) ; RV64I-NEXT: lh s4, 24(a1) -; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s4 ; RV64I-NEXT: call __moddi3 @@ -562,49 +558,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a2, 16(a1) -; RV64IM-NEXT: lh a3, 24(a1) -; RV64IM-NEXT: lui a4, %hi(.LCPI2_0) -; RV64IM-NEXT: ld a4, %lo(.LCPI2_0)(a4) -; RV64IM-NEXT: lh a5, 0(a1) -; RV64IM-NEXT: lh a1, 8(a1) +; RV64IM-NEXT: lh a2, 0(a1) +; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a4, 16(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; 
RV64IM-NEXT: lui a5, %hi(.LCPI2_0) ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mulh a7, a3, a4 -; RV64IM-NEXT: mulh t0, a2, a4 -; RV64IM-NEXT: mulh t1, a1, a4 -; RV64IM-NEXT: mulh a4, a5, a4 -; RV64IM-NEXT: add a7, a7, a3 -; RV64IM-NEXT: add t0, t0, a2 -; RV64IM-NEXT: add t1, t1, a1 -; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: ld a5, %lo(.LCPI2_0)(a5) +; RV64IM-NEXT: mulh a7, a1, a5 +; RV64IM-NEXT: mulh t0, a4, a5 +; RV64IM-NEXT: mulh t1, a3, a5 +; RV64IM-NEXT: mulh a5, a2, a5 +; RV64IM-NEXT: add a7, a7, a1 +; RV64IM-NEXT: add t0, t0, a4 +; RV64IM-NEXT: add t1, t1, a3 +; RV64IM-NEXT: add a5, a5, a2 ; RV64IM-NEXT: srli t2, a7, 63 ; RV64IM-NEXT: srai a7, a7, 6 ; RV64IM-NEXT: srli t3, t0, 63 ; RV64IM-NEXT: srai t0, t0, 6 ; RV64IM-NEXT: srli t4, t1, 63 ; RV64IM-NEXT: srai t1, t1, 6 -; RV64IM-NEXT: srli t5, a4, 63 -; RV64IM-NEXT: srai a4, a4, 6 +; RV64IM-NEXT: srli t5, a5, 63 +; RV64IM-NEXT: srai a5, a5, 6 ; RV64IM-NEXT: add a7, a7, t2 ; RV64IM-NEXT: add t0, t0, t3 ; RV64IM-NEXT: add t1, t1, t4 -; RV64IM-NEXT: add a4, a4, t5 +; RV64IM-NEXT: add a5, a5, t5 ; RV64IM-NEXT: mul t2, a7, a6 ; RV64IM-NEXT: mul t3, t0, a6 ; RV64IM-NEXT: mul t4, t1, a6 -; RV64IM-NEXT: mul a6, a4, a6 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: add a1, a1, t1 -; RV64IM-NEXT: add a2, a2, t0 -; RV64IM-NEXT: add a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a6 -; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a2, a2, t3 -; RV64IM-NEXT: subw a3, a3, t2 -; RV64IM-NEXT: sh a4, 0(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 4(a0) -; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: mul a6, a5, a6 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: add a3, a3, t1 +; RV64IM-NEXT: add a4, a4, t0 +; RV64IM-NEXT: add a1, a1, a7 +; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, t4 +; RV64IM-NEXT: subw a4, a4, t3 +; RV64IM-NEXT: subw a1, a1, t2 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, @@ -655,36 +651,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: dont_fold_srem_power_of_two: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a2, 4(a1) -; RV32IM-NEXT: lh a3, 8(a1) -; RV32IM-NEXT: lh a4, 12(a1) -; RV32IM-NEXT: lh a1, 0(a1) +; RV32IM-NEXT: lh a2, 0(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) ; RV32IM-NEXT: lui a5, 706409 ; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a5, a4, a5 -; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: mulh a5, a1, a5 +; RV32IM-NEXT: add a5, a5, a1 ; RV32IM-NEXT: srli a6, a5, 31 ; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: add a5, a5, a6 -; RV32IM-NEXT: srli a6, a1, 26 -; RV32IM-NEXT: add a6, a1, a6 -; RV32IM-NEXT: andi a6, a6, -64 -; RV32IM-NEXT: sub a1, a1, a6 -; RV32IM-NEXT: srli a6, a2, 27 +; RV32IM-NEXT: srli a6, a2, 26 ; RV32IM-NEXT: add a6, a2, a6 -; RV32IM-NEXT: andi a6, a6, -32 +; RV32IM-NEXT: andi a6, a6, -64 ; RV32IM-NEXT: sub a2, a2, a6 -; RV32IM-NEXT: srli a6, a3, 29 +; RV32IM-NEXT: srli a6, a3, 27 ; RV32IM-NEXT: add a6, a3, a6 -; RV32IM-NEXT: andi a6, a6, -8 +; RV32IM-NEXT: andi a6, a6, -32 ; RV32IM-NEXT: sub a3, a3, a6 +; RV32IM-NEXT: srli a6, a4, 29 +; RV32IM-NEXT: add a6, a4, a6 +; RV32IM-NEXT: andi a6, a6, -8 +; RV32IM-NEXT: sub a4, a4, a6 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: sh a1, 0(a0) -; RV32IM-NEXT: sh a2, 2(a0) -; RV32IM-NEXT: sh a3, 4(a0) -; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sub a1, a1, 
a5 +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a1, 6(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_srem_power_of_two: @@ -773,26 +769,25 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lh a2, 4(a1) -; RV32I-NEXT: lh s0, 8(a1) -; RV32I-NEXT: lh s1, 12(a1) -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh a0, 4(a1) +; RV32I-NEXT: lh s1, 8(a1) +; RV32I-NEXT: lh s2, 12(a1) ; RV32I-NEXT: li a1, 654 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __modsi3 ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 23 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __modsi3 -; RV32I-NEXT: sh zero, 0(s2) -; RV32I-NEXT: sh s3, 2(s2) -; RV32I-NEXT: sh s0, 4(s2) -; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh a0, 6(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -850,26 +845,25 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lh a2, 8(a1) -; RV64I-NEXT: lh s0, 16(a1) -; RV64I-NEXT: lh s1, 24(a1) -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh a0, 8(a1) +; RV64I-NEXT: lh s1, 16(a1) +; RV64I-NEXT: lh s2, 24(a1) ; RV64I-NEXT: li a1, 654 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sh zero, 0(s2) -; RV64I-NEXT: sh s3, 2(s2) -; RV64I-NEXT: sh s0, 4(s2) -; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh a0, 6(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -1036,31 +1030,31 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lh a2, 8(a1) ; RV64IM-NEXT: lh a3, 16(a1) ; RV64IM-NEXT: lh a1, 24(a1) -; RV64IM-NEXT: lui a4, %hi(.LCPI5_0) -; RV64IM-NEXT: lui a5, %hi(.LCPI5_1) -; RV64IM-NEXT: ld a5, %lo(.LCPI5_1)(a5) -; RV64IM-NEXT: lui a6, 8 -; RV64IM-NEXT: ld a4, %lo(.LCPI5_0)(a4) -; RV64IM-NEXT: srli a7, a2, 49 -; RV64IM-NEXT: mulh a5, a1, a5 -; RV64IM-NEXT: add a7, a2, a7 -; RV64IM-NEXT: and a6, a7, a6 -; RV64IM-NEXT: srli a7, a5, 63 -; RV64IM-NEXT: srli a5, a5, 11 -; RV64IM-NEXT: add a5, a5, a7 -; RV64IM-NEXT: mulh a4, a3, a4 -; RV64IM-NEXT: add a4, a4, a3 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: lui a4, %hi(.LCPI5_1) +; RV64IM-NEXT: lui a5, 8 +; RV64IM-NEXT: ld a4, %lo(.LCPI5_1)(a4) +; RV64IM-NEXT: srli a6, a2, 49 +; RV64IM-NEXT: mulh a4, a1, a4 +; RV64IM-NEXT: add a6, a2, a6 +; RV64IM-NEXT: and a5, a6, a5 ; RV64IM-NEXT: 
srli a6, a4, 63 -; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: srli a4, a4, 11 ; RV64IM-NEXT: add a4, a4, a6 +; RV64IM-NEXT: lui a6, %hi(.LCPI5_0) +; RV64IM-NEXT: ld a6, %lo(.LCPI5_0)(a6) +; RV64IM-NEXT: mulh a6, a3, a6 +; RV64IM-NEXT: add a6, a6, a3 +; RV64IM-NEXT: subw a2, a2, a5 +; RV64IM-NEXT: srli a5, a6, 63 +; RV64IM-NEXT: srli a6, a6, 4 +; RV64IM-NEXT: add a5, a6, a5 ; RV64IM-NEXT: lui a6, 1 ; RV64IM-NEXT: addi a6, a6, 1327 -; RV64IM-NEXT: mul a5, a5, a6 -; RV64IM-NEXT: li a6, 23 ; RV64IM-NEXT: mul a4, a4, a6 -; RV64IM-NEXT: subw a1, a1, a5 -; RV64IM-NEXT: subw a3, a3, a4 +; RV64IM-NEXT: li a6, 23 +; RV64IM-NEXT: mul a5, a5, a6 +; RV64IM-NEXT: subw a1, a1, a4 +; RV64IM-NEXT: subw a3, a3, a5 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -1085,18 +1079,17 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw s1, 16(a1) ; RV32I-NEXT: lw s2, 20(a1) ; RV32I-NEXT: lw s3, 24(a1) ; RV32I-NEXT: lw s4, 28(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: lw s5, 8(a1) ; RV32I-NEXT: lw s6, 12(a1) -; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3 ; RV32I-NEXT: mv s7, a0 @@ -1155,18 +1148,17 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: lw s1, 16(a1) ; RV32IM-NEXT: lw s2, 20(a1) ; RV32IM-NEXT: lw s3, 24(a1) ; RV32IM-NEXT: lw s4, 28(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a4, 4(a1) +; RV32IM-NEXT: lw a0, 0(a1) +; RV32IM-NEXT: lw a3, 4(a1) ; RV32IM-NEXT: lw s5, 8(a1) ; RV32IM-NEXT: lw s6, 12(a1) -; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: li a2, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a1, a4 +; RV32IM-NEXT: mv a1, a3 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3 ; RV32IM-NEXT: mv s7, a0 @@ -1220,26 +1212,25 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: ld a2, 8(a1) -; RV64I-NEXT: ld s0, 16(a1) -; RV64I-NEXT: ld s1, 24(a1) -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: ld a0, 8(a1) +; RV64I-NEXT: ld s1, 16(a1) +; RV64I-NEXT: ld s2, 24(a1) ; RV64I-NEXT: li a1, 654 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __moddi3 -; RV64I-NEXT: sd zero, 0(s2) -; RV64I-NEXT: sd s3, 8(s2) -; RV64I-NEXT: sd s0, 16(s2) -; RV64I-NEXT: sd a0, 24(s2) +; RV64I-NEXT: sd zero, 0(s0) +; RV64I-NEXT: sd s3, 8(s0) +; RV64I-NEXT: sd s1, 16(s0) +; RV64I-NEXT: sd a0, 24(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload diff --git 
a/llvm/test/CodeGen/RISCV/stack-slot-size.ll b/llvm/test/CodeGen/RISCV/stack-slot-size.ll index 71ee6d8160a9d..4691cb6032bcc 100644 --- a/llvm/test/CodeGen/RISCV/stack-slot-size.ll +++ b/llvm/test/CodeGen/RISCV/stack-slot-size.ll @@ -21,11 +21,11 @@ define i32 @caller129() nounwind { ; RV32I-NEXT: li a0, 42 ; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: call callee129 ; RV32I-NEXT: lw a0, 24(sp) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -38,10 +38,10 @@ define i32 @caller129() nounwind { ; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; RV64I-NEXT: li a0, 42 ; RV64I-NEXT: sw a0, 36(sp) -; RV64I-NEXT: mv a0, sp ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: mv a0, sp ; RV64I-NEXT: call callee129 ; RV64I-NEXT: lw a0, 36(sp) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -62,11 +62,11 @@ define i32 @caller160() nounwind { ; RV32I-NEXT: li a0, 42 ; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw zero, 16(sp) -; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: call callee160 ; RV32I-NEXT: lw a0, 24(sp) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -79,10 +79,10 @@ define i32 @caller160() nounwind { ; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; RV64I-NEXT: li a0, 42 ; RV64I-NEXT: sw a0, 36(sp) -; RV64I-NEXT: mv a0, sp ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: mv a0, sp ; RV64I-NEXT: call callee160 ; RV64I-NEXT: lw a0, 36(sp) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -104,11 +104,11 @@ define i32 @caller161() nounwind { ; RV32I-NEXT: sw a0, 24(sp) ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: call callee161 ; RV32I-NEXT: lw a0, 24(sp) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -121,10 +121,10 @@ define i32 @caller161() nounwind { ; RV64I-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; RV64I-NEXT: li a0, 42 ; RV64I-NEXT: sw a0, 36(sp) -; RV64I-NEXT: mv a0, sp ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: mv a0, sp ; RV64I-NEXT: call callee161 ; RV64I-NEXT: lw a0, 36(sp) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll index cd1aebfea5ce4..27fa059ce5429 100644 --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -29,37 +29,37 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw s10, 656(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw s11, 652(sp) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(U) -; CHECK-NEXT: lw s9, %lo(U)(a0) -; CHECK-NEXT: lw s10, %lo(U+4)(a0) -; CHECK-NEXT: lw s11, %lo(U+8)(a0) -; CHECK-NEXT: lw s5, %lo(U+12)(a0) +; CHECK-NEXT: lw s6, %lo(U)(a0) +; CHECK-NEXT: lw s7, %lo(U+4)(a0) +; CHECK-NEXT: lw s8, %lo(U+8)(a0) +; CHECK-NEXT: lw s0, %lo(U+12)(a0) ; CHECK-NEXT: sw zero, 616(sp) ; CHECK-NEXT: sw zero, 620(sp) ; CHECK-NEXT: sw zero, 624(sp) ; CHECK-NEXT: sw zero, 628(sp) +; 
CHECK-NEXT: sw s6, 600(sp) +; CHECK-NEXT: sw s7, 604(sp) +; CHECK-NEXT: sw s8, 608(sp) +; CHECK-NEXT: sw s0, 612(sp) ; CHECK-NEXT: addi a0, sp, 632 ; CHECK-NEXT: addi a1, sp, 616 ; CHECK-NEXT: addi a2, sp, 600 -; CHECK-NEXT: sw s9, 600(sp) -; CHECK-NEXT: sw s10, 604(sp) -; CHECK-NEXT: sw s11, 608(sp) -; CHECK-NEXT: sw s5, 612(sp) ; CHECK-NEXT: call __subtf3 ; CHECK-NEXT: lw s1, 632(sp) ; CHECK-NEXT: lw s2, 636(sp) ; CHECK-NEXT: lw s3, 640(sp) ; CHECK-NEXT: lw s4, 644(sp) -; CHECK-NEXT: sw s9, 552(sp) -; CHECK-NEXT: sw s10, 556(sp) -; CHECK-NEXT: sw s11, 560(sp) -; CHECK-NEXT: sw s5, 564(sp) -; CHECK-NEXT: addi a0, sp, 584 -; CHECK-NEXT: addi a1, sp, 568 -; CHECK-NEXT: addi a2, sp, 552 +; CHECK-NEXT: sw s6, 552(sp) +; CHECK-NEXT: sw s7, 556(sp) +; CHECK-NEXT: sw s8, 560(sp) +; CHECK-NEXT: sw s0, 564(sp) ; CHECK-NEXT: sw s1, 568(sp) ; CHECK-NEXT: sw s2, 572(sp) ; CHECK-NEXT: sw s3, 576(sp) ; CHECK-NEXT: sw s4, 580(sp) +; CHECK-NEXT: addi a0, sp, 584 +; CHECK-NEXT: addi a1, sp, 568 +; CHECK-NEXT: addi a2, sp, 552 ; CHECK-NEXT: call __subtf3 ; CHECK-NEXT: lw a0, 584(sp) ; CHECK-NEXT: sw a0, 52(sp) # 4-byte Folded Spill @@ -73,18 +73,22 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw zero, 508(sp) ; CHECK-NEXT: sw zero, 512(sp) ; CHECK-NEXT: sw zero, 516(sp) +; CHECK-NEXT: sw s6, 520(sp) +; CHECK-NEXT: sw s7, 524(sp) +; CHECK-NEXT: sw s8, 528(sp) +; CHECK-NEXT: sw s0, 532(sp) ; CHECK-NEXT: addi a0, sp, 536 ; CHECK-NEXT: addi a1, sp, 520 ; CHECK-NEXT: addi a2, sp, 504 -; CHECK-NEXT: sw s9, 520(sp) -; CHECK-NEXT: sw s10, 524(sp) -; CHECK-NEXT: sw s11, 528(sp) -; CHECK-NEXT: sw s5, 532(sp) ; CHECK-NEXT: call __addtf3 -; CHECK-NEXT: lw s0, 536(sp) -; CHECK-NEXT: lw s6, 540(sp) -; CHECK-NEXT: lw s7, 544(sp) -; CHECK-NEXT: lw s8, 548(sp) +; CHECK-NEXT: lw s5, 536(sp) +; CHECK-NEXT: sw s5, 36(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s9, 540(sp) +; CHECK-NEXT: sw s9, 32(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s10, 544(sp) +; CHECK-NEXT: sw s10, 28(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s11, 548(sp) +; CHECK-NEXT: sw s11, 24(sp) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(Y1) ; CHECK-NEXT: lw a1, %lo(Y1)(a0) ; CHECK-NEXT: sw a1, 20(sp) # 4-byte Folded Spill @@ -98,13 +102,13 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw a2, 316(sp) ; CHECK-NEXT: sw a3, 320(sp) ; CHECK-NEXT: sw a0, 324(sp) -; CHECK-NEXT: addi a0, sp, 344 -; CHECK-NEXT: addi a1, sp, 328 -; CHECK-NEXT: addi a2, sp, 312 ; CHECK-NEXT: sw s1, 328(sp) ; CHECK-NEXT: sw s2, 332(sp) ; CHECK-NEXT: sw s3, 336(sp) ; CHECK-NEXT: sw s4, 340(sp) +; CHECK-NEXT: addi a0, sp, 344 +; CHECK-NEXT: addi a1, sp, 328 +; CHECK-NEXT: addi a2, sp, 312 ; CHECK-NEXT: call __multf3 ; CHECK-NEXT: lw a0, 344(sp) ; CHECK-NEXT: sw a0, 68(sp) # 4-byte Folded Spill @@ -114,180 +118,176 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw a0, 60(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 356(sp) ; CHECK-NEXT: sw a0, 56(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s9, 472(sp) -; CHECK-NEXT: sw s10, 476(sp) -; CHECK-NEXT: sw s11, 480(sp) -; CHECK-NEXT: sw s5, 484(sp) +; CHECK-NEXT: sw s6, 472(sp) +; CHECK-NEXT: sw s7, 476(sp) +; CHECK-NEXT: sw s8, 480(sp) +; CHECK-NEXT: sw s0, 484(sp) +; CHECK-NEXT: sw s5, 456(sp) +; CHECK-NEXT: sw s9, 460(sp) +; CHECK-NEXT: sw s10, 464(sp) +; CHECK-NEXT: sw s11, 468(sp) ; CHECK-NEXT: addi a0, sp, 488 ; CHECK-NEXT: addi a1, sp, 472 ; CHECK-NEXT: addi a2, sp, 456 -; CHECK-NEXT: sw s0, 456(sp) -; CHECK-NEXT: sw s6, 460(sp) -; CHECK-NEXT: sw s7, 464(sp) -; 
CHECK-NEXT: sw s8, 468(sp) ; CHECK-NEXT: call __addtf3 -; CHECK-NEXT: lw a3, 488(sp) -; CHECK-NEXT: lw a4, 492(sp) -; CHECK-NEXT: lw a5, 496(sp) -; CHECK-NEXT: lw a6, 500(sp) +; CHECK-NEXT: lw a0, 488(sp) +; CHECK-NEXT: lw a1, 492(sp) +; CHECK-NEXT: lw a2, 496(sp) +; CHECK-NEXT: lw a3, 500(sp) ; CHECK-NEXT: sw zero, 424(sp) ; CHECK-NEXT: sw zero, 428(sp) ; CHECK-NEXT: sw zero, 432(sp) ; CHECK-NEXT: sw zero, 436(sp) +; CHECK-NEXT: sw a0, 408(sp) +; CHECK-NEXT: sw a1, 412(sp) +; CHECK-NEXT: sw a2, 416(sp) +; CHECK-NEXT: sw a3, 420(sp) ; CHECK-NEXT: addi a0, sp, 440 ; CHECK-NEXT: addi a1, sp, 424 ; CHECK-NEXT: addi a2, sp, 408 -; CHECK-NEXT: sw a3, 408(sp) -; CHECK-NEXT: sw a4, 412(sp) -; CHECK-NEXT: sw a5, 416(sp) -; CHECK-NEXT: sw a6, 420(sp) ; CHECK-NEXT: call __subtf3 -; CHECK-NEXT: lw a0, 448(sp) -; CHECK-NEXT: lw a1, 452(sp) -; CHECK-NEXT: lw a2, 440(sp) -; CHECK-NEXT: lw a3, 444(sp) +; CHECK-NEXT: lw a0, 440(sp) +; CHECK-NEXT: lw a1, 444(sp) +; CHECK-NEXT: lw a2, 448(sp) +; CHECK-NEXT: lw a3, 452(sp) ; CHECK-NEXT: lui a4, %hi(X) -; CHECK-NEXT: sw a1, %lo(X+12)(a4) -; CHECK-NEXT: sw a0, %lo(X+8)(a4) -; CHECK-NEXT: sw a3, %lo(X+4)(a4) -; CHECK-NEXT: sw a2, %lo(X)(a4) -; CHECK-NEXT: lw s5, 20(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s5, 216(sp) -; CHECK-NEXT: lw s9, 16(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s9, 220(sp) -; CHECK-NEXT: lw s10, 12(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s10, 224(sp) -; CHECK-NEXT: lw s11, 8(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s11, 228(sp) +; CHECK-NEXT: sw a3, %lo(X+12)(a4) +; CHECK-NEXT: sw a2, %lo(X+8)(a4) +; CHECK-NEXT: sw a1, %lo(X+4)(a4) +; CHECK-NEXT: sw a0, %lo(X)(a4) +; CHECK-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s1, 216(sp) +; CHECK-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s2, 220(sp) +; CHECK-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s3, 224(sp) +; CHECK-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s4, 228(sp) +; CHECK-NEXT: lw s5, 52(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s5, 232(sp) +; CHECK-NEXT: lw s9, 48(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s9, 236(sp) +; CHECK-NEXT: lw s10, 44(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s10, 240(sp) +; CHECK-NEXT: lw s11, 40(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s11, 244(sp) ; CHECK-NEXT: addi a0, sp, 248 ; CHECK-NEXT: addi a1, sp, 232 ; CHECK-NEXT: addi a2, sp, 216 -; CHECK-NEXT: lw s1, 52(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s1, 232(sp) -; CHECK-NEXT: lw s2, 48(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s2, 236(sp) -; CHECK-NEXT: lw s3, 44(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s3, 240(sp) -; CHECK-NEXT: lw s4, 40(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s4, 244(sp) ; CHECK-NEXT: call __multf3 -; CHECK-NEXT: lw a0, 248(sp) -; CHECK-NEXT: sw a0, 36(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 252(sp) -; CHECK-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 256(sp) -; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 260(sp) -; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s0, 248(sp) +; CHECK-NEXT: lw s6, 252(sp) +; CHECK-NEXT: lw s7, 256(sp) +; CHECK-NEXT: lw s8, 260(sp) ; CHECK-NEXT: sw zero, 360(sp) ; CHECK-NEXT: sw zero, 364(sp) ; CHECK-NEXT: sw zero, 368(sp) ; CHECK-NEXT: sw zero, 372(sp) +; CHECK-NEXT: lw a0, 36(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 376(sp) +; CHECK-NEXT: lw a0, 32(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 380(sp) +; CHECK-NEXT: lw a0, 28(sp) # 4-byte Folded Reload +; 
CHECK-NEXT: sw a0, 384(sp) +; CHECK-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 388(sp) ; CHECK-NEXT: addi a0, sp, 392 ; CHECK-NEXT: addi a1, sp, 376 ; CHECK-NEXT: addi a2, sp, 360 -; CHECK-NEXT: sw s0, 376(sp) -; CHECK-NEXT: sw s6, 380(sp) -; CHECK-NEXT: sw s7, 384(sp) -; CHECK-NEXT: sw s8, 388(sp) ; CHECK-NEXT: call __multf3 -; CHECK-NEXT: lw a0, 400(sp) -; CHECK-NEXT: lw a1, 404(sp) -; CHECK-NEXT: lw a2, 392(sp) -; CHECK-NEXT: lw a3, 396(sp) +; CHECK-NEXT: lw a0, 392(sp) +; CHECK-NEXT: lw a1, 396(sp) +; CHECK-NEXT: lw a2, 400(sp) +; CHECK-NEXT: lw a3, 404(sp) ; CHECK-NEXT: lui a4, %hi(S) -; CHECK-NEXT: sw a1, %lo(S+12)(a4) -; CHECK-NEXT: sw a0, %lo(S+8)(a4) -; CHECK-NEXT: sw a3, %lo(S+4)(a4) -; CHECK-NEXT: sw a2, %lo(S)(a4) -; CHECK-NEXT: sw s1, 264(sp) -; CHECK-NEXT: sw s2, 268(sp) -; CHECK-NEXT: sw s3, 272(sp) -; CHECK-NEXT: sw s4, 276(sp) +; CHECK-NEXT: sw a3, %lo(S+12)(a4) +; CHECK-NEXT: sw a2, %lo(S+8)(a4) +; CHECK-NEXT: sw a1, %lo(S+4)(a4) +; CHECK-NEXT: sw a0, %lo(S)(a4) +; CHECK-NEXT: sw s5, 264(sp) +; CHECK-NEXT: sw s9, 268(sp) +; CHECK-NEXT: sw s10, 272(sp) +; CHECK-NEXT: sw s11, 276(sp) +; CHECK-NEXT: lw a0, 68(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 280(sp) +; CHECK-NEXT: lw a0, 64(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 284(sp) +; CHECK-NEXT: lw a0, 60(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 288(sp) +; CHECK-NEXT: lw a0, 56(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 292(sp) ; CHECK-NEXT: addi a0, sp, 296 ; CHECK-NEXT: addi a1, sp, 280 ; CHECK-NEXT: addi a2, sp, 264 -; CHECK-NEXT: lw a3, 68(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 280(sp) -; CHECK-NEXT: lw a3, 64(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 284(sp) -; CHECK-NEXT: lw a3, 60(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 288(sp) -; CHECK-NEXT: lw a3, 56(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 292(sp) ; CHECK-NEXT: call __subtf3 -; CHECK-NEXT: lw a0, 304(sp) -; CHECK-NEXT: lw a1, 308(sp) -; CHECK-NEXT: lw a2, 296(sp) -; CHECK-NEXT: lw a3, 300(sp) +; CHECK-NEXT: lw a0, 296(sp) +; CHECK-NEXT: lw a1, 300(sp) +; CHECK-NEXT: lw a2, 304(sp) +; CHECK-NEXT: lw a3, 308(sp) ; CHECK-NEXT: lui a4, %hi(T) -; CHECK-NEXT: sw a1, %lo(T+12)(a4) -; CHECK-NEXT: sw a0, %lo(T+8)(a4) -; CHECK-NEXT: sw a3, %lo(T+4)(a4) -; CHECK-NEXT: sw a2, %lo(T)(a4) +; CHECK-NEXT: sw a3, %lo(T+12)(a4) +; CHECK-NEXT: sw a2, %lo(T+8)(a4) +; CHECK-NEXT: sw a1, %lo(T+4)(a4) +; CHECK-NEXT: sw a0, %lo(T)(a4) ; CHECK-NEXT: sw zero, 168(sp) ; CHECK-NEXT: sw zero, 172(sp) ; CHECK-NEXT: sw zero, 176(sp) ; CHECK-NEXT: sw zero, 180(sp) +; CHECK-NEXT: sw s0, 184(sp) +; CHECK-NEXT: sw s6, 188(sp) +; CHECK-NEXT: sw s7, 192(sp) +; CHECK-NEXT: sw s8, 196(sp) ; CHECK-NEXT: addi a0, sp, 200 ; CHECK-NEXT: addi a1, sp, 184 ; CHECK-NEXT: addi a2, sp, 168 -; CHECK-NEXT: lw a3, 36(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 184(sp) -; CHECK-NEXT: lw a3, 32(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 188(sp) -; CHECK-NEXT: lw a3, 28(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 192(sp) -; CHECK-NEXT: lw a3, 24(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 196(sp) ; CHECK-NEXT: call __addtf3 -; CHECK-NEXT: lw a0, 208(sp) -; CHECK-NEXT: lw a1, 212(sp) -; CHECK-NEXT: lw a2, 200(sp) -; CHECK-NEXT: lw a3, 204(sp) +; CHECK-NEXT: lw a0, 200(sp) +; CHECK-NEXT: lw a1, 204(sp) +; CHECK-NEXT: lw a2, 208(sp) +; CHECK-NEXT: lw a3, 212(sp) ; CHECK-NEXT: lui a4, %hi(Y) -; CHECK-NEXT: sw a1, %lo(Y+12)(a4) -; CHECK-NEXT: sw a0, %lo(Y+8)(a4) -; CHECK-NEXT: sw a3, %lo(Y+4)(a4) -; CHECK-NEXT: sw a2, 
%lo(Y)(a4) +; CHECK-NEXT: sw a3, %lo(Y+12)(a4) +; CHECK-NEXT: sw a2, %lo(Y+8)(a4) +; CHECK-NEXT: sw a1, %lo(Y+4)(a4) +; CHECK-NEXT: sw a0, %lo(Y)(a4) ; CHECK-NEXT: sw zero, 120(sp) ; CHECK-NEXT: sw zero, 124(sp) ; CHECK-NEXT: sw zero, 128(sp) ; CHECK-NEXT: sw zero, 132(sp) +; CHECK-NEXT: sw s1, 136(sp) +; CHECK-NEXT: sw s2, 140(sp) +; CHECK-NEXT: sw s3, 144(sp) +; CHECK-NEXT: sw s4, 148(sp) ; CHECK-NEXT: addi a0, sp, 152 ; CHECK-NEXT: addi a1, sp, 136 ; CHECK-NEXT: addi a2, sp, 120 -; CHECK-NEXT: sw s5, 136(sp) -; CHECK-NEXT: sw s9, 140(sp) -; CHECK-NEXT: sw s10, 144(sp) -; CHECK-NEXT: sw s11, 148(sp) ; CHECK-NEXT: call __multf3 -; CHECK-NEXT: lw a3, 152(sp) -; CHECK-NEXT: lw a4, 156(sp) -; CHECK-NEXT: lw a5, 160(sp) -; CHECK-NEXT: lw a6, 164(sp) -; CHECK-NEXT: lui a2, 786400 +; CHECK-NEXT: lw a2, 152(sp) +; CHECK-NEXT: lw a3, 156(sp) +; CHECK-NEXT: lw a4, 160(sp) +; CHECK-NEXT: lw a5, 164(sp) +; CHECK-NEXT: lui a1, 786400 ; CHECK-NEXT: addi a0, sp, 104 -; CHECK-NEXT: addi a1, sp, 88 ; CHECK-NEXT: sw zero, 72(sp) ; CHECK-NEXT: sw zero, 76(sp) ; CHECK-NEXT: sw zero, 80(sp) -; CHECK-NEXT: sw a2, 84(sp) +; CHECK-NEXT: sw a1, 84(sp) +; CHECK-NEXT: addi a1, sp, 88 +; CHECK-NEXT: sw a2, 88(sp) +; CHECK-NEXT: sw a3, 92(sp) +; CHECK-NEXT: sw a4, 96(sp) +; CHECK-NEXT: sw a5, 100(sp) ; CHECK-NEXT: addi a2, sp, 72 -; CHECK-NEXT: sw a3, 88(sp) -; CHECK-NEXT: sw a4, 92(sp) -; CHECK-NEXT: sw a5, 96(sp) -; CHECK-NEXT: sw a6, 100(sp) ; CHECK-NEXT: call __addtf3 -; CHECK-NEXT: lw a0, 112(sp) -; CHECK-NEXT: lw a1, 116(sp) -; CHECK-NEXT: lw a2, 104(sp) -; CHECK-NEXT: lw a3, 108(sp) +; CHECK-NEXT: lw a0, 104(sp) +; CHECK-NEXT: lw a1, 108(sp) +; CHECK-NEXT: lw a2, 112(sp) +; CHECK-NEXT: lw a3, 116(sp) ; CHECK-NEXT: lui a4, %hi(Y1) -; CHECK-NEXT: sw a0, %lo(Y1+8)(a4) -; CHECK-NEXT: sw a1, %lo(Y1+12)(a4) -; CHECK-NEXT: sw a2, %lo(Y1)(a4) -; CHECK-NEXT: sw a3, %lo(Y1+4)(a4) +; CHECK-NEXT: sw a2, %lo(Y1+8)(a4) +; CHECK-NEXT: sw a3, %lo(Y1+12)(a4) +; CHECK-NEXT: sw a0, %lo(Y1)(a4) +; CHECK-NEXT: sw a1, %lo(Y1+4)(a4) ; CHECK-NEXT: lw ra, 700(sp) # 4-byte Folded Reload ; CHECK-NEXT: lw s0, 696(sp) # 4-byte Folded Reload ; CHECK-NEXT: lw s1, 692(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll index 366b37ac5d472..a6acb2827acea 100644 --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -30,27 +30,23 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1) define void @caller_extern(ptr %src) optsize { ; CHECK-LABEL: caller_extern: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a1, %hi(dest) -; CHECK-NEXT: addi a1, a1, %lo(dest) +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: lui a0, %hi(dest) +; CHECK-NEXT: addi a0, a0, %lo(dest) ; CHECK-NEXT: li a2, 7 -; CHECK-NEXT: mv a3, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: mv a1, a3 ; CHECK-NEXT: tail memcpy ; ; CHECK-LARGE-ZICFILP-LABEL: caller_extern: ; CHECK-LARGE-ZICFILP: # %bb.0: # %entry ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 +; CHECK-LARGE-ZICFILP-NEXT: mv a1, a0 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi1: -; CHECK-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI1_0) +; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI1_0) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi2: ; CHECK-LARGE-ZICFILP-NEXT: auipc a2, %pcrel_hi(.LCPI1_1) -; CHECK-LARGE-ZICFILP-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi1)(a1) +; CHECK-LARGE-ZICFILP-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi1)(a0) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi2)(a2) ; CHECK-LARGE-ZICFILP-NEXT: li a2, 7 -; CHECK-LARGE-ZICFILP-NEXT: mv a3, 
a0 -; CHECK-LARGE-ZICFILP-NEXT: mv a0, a1 -; CHECK-LARGE-ZICFILP-NEXT: mv a1, a3 ; CHECK-LARGE-ZICFILP-NEXT: jr t2 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr @dest, ptr %src, i32 7, i1 false) @@ -62,27 +58,23 @@ entry: define void @caller_extern_pgso(ptr %src) !prof !14 { ; CHECK-LABEL: caller_extern_pgso: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a1, %hi(dest_pgso) -; CHECK-NEXT: addi a1, a1, %lo(dest_pgso) +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: lui a0, %hi(dest_pgso) +; CHECK-NEXT: addi a0, a0, %lo(dest_pgso) ; CHECK-NEXT: li a2, 7 -; CHECK-NEXT: mv a3, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: mv a1, a3 ; CHECK-NEXT: tail memcpy ; ; CHECK-LARGE-ZICFILP-LABEL: caller_extern_pgso: ; CHECK-LARGE-ZICFILP: # %bb.0: # %entry ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 +; CHECK-LARGE-ZICFILP-NEXT: mv a1, a0 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi3: -; CHECK-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI2_0) +; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI2_0) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi4: ; CHECK-LARGE-ZICFILP-NEXT: auipc a2, %pcrel_hi(.LCPI2_1) -; CHECK-LARGE-ZICFILP-NEXT: lw a1, %pcrel_lo(.Lpcrel_hi3)(a1) +; CHECK-LARGE-ZICFILP-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi3)(a0) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi4)(a2) ; CHECK-LARGE-ZICFILP-NEXT: li a2, 7 -; CHECK-LARGE-ZICFILP-NEXT: mv a3, a0 -; CHECK-LARGE-ZICFILP-NEXT: mv a0, a1 -; CHECK-LARGE-ZICFILP-NEXT: mv a1, a3 ; CHECK-LARGE-ZICFILP-NEXT: jr t2 entry: tail call void @llvm.memcpy.p0.p0.i32(ptr @dest_pgso, ptr %src, i32 7, i1 false) @@ -181,10 +173,10 @@ define void @caller_varargs(i32 %a, i32 %b) nounwind { ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 ; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -16 ; CHECK-LARGE-ZICFILP-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-LARGE-ZICFILP-NEXT: sw a0, 0(sp) ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi7: ; CHECK-LARGE-ZICFILP-NEXT: auipc a2, %pcrel_hi(.LCPI5_0) ; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi7)(a2) -; CHECK-LARGE-ZICFILP-NEXT: sw a0, 0(sp) ; CHECK-LARGE-ZICFILP-NEXT: mv a2, a1 ; CHECK-LARGE-ZICFILP-NEXT: mv a3, a0 ; CHECK-LARGE-ZICFILP-NEXT: mv a4, a0 @@ -231,19 +223,19 @@ define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g ; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill ; CHECK-LARGE-ZICFILP-NEXT: lw t0, 32(sp) ; CHECK-LARGE-ZICFILP-NEXT: lw t1, 36(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t3, 40(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t4, 44(sp) -; CHECK-LARGE-ZICFILP-NEXT: lw t2, 48(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t2, 40(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t3, 44(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t4, 48(sp) ; CHECK-LARGE-ZICFILP-NEXT: lw t5, 52(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t2, 16(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t4, 16(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw t5, 20(sp) -; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi8: -; CHECK-LARGE-ZICFILP-NEXT: auipc t2, %pcrel_hi(.LCPI6_0) -; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi8)(t2) ; CHECK-LARGE-ZICFILP-NEXT: sw t0, 0(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw t1, 4(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t3, 8(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw t4, 12(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t2, 8(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw t3, 12(sp) +; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi8: +; CHECK-LARGE-ZICFILP-NEXT: auipc t0, %pcrel_hi(.LCPI6_0) +; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi8)(t0) ; CHECK-LARGE-ZICFILP-NEXT: jalr t2 ; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32 @@ -260,12 +252,12 @@ define 
void @caller_indirect_args() nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi sp, sp, -32 ; CHECK-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lui a1, 262128 -; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: lui a0, 262128 ; CHECK-NEXT: sw zero, 0(sp) ; CHECK-NEXT: sw zero, 4(sp) ; CHECK-NEXT: sw zero, 8(sp) -; CHECK-NEXT: sw a1, 12(sp) +; CHECK-NEXT: sw a0, 12(sp) +; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: call callee_indirect_args ; CHECK-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 32 @@ -276,15 +268,15 @@ define void @caller_indirect_args() nounwind { ; CHECK-LARGE-ZICFILP-NEXT: lpad 0 ; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -32 ; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; CHECK-LARGE-ZICFILP-NEXT: lui a1, 262128 +; CHECK-LARGE-ZICFILP-NEXT: lui a0, 262128 ; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi9: -; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI7_0) -; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi9)(a0) -; CHECK-LARGE-ZICFILP-NEXT: mv a0, sp +; CHECK-LARGE-ZICFILP-NEXT: auipc a1, %pcrel_hi(.LCPI7_0) ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 0(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 4(sp) ; CHECK-LARGE-ZICFILP-NEXT: sw zero, 8(sp) -; CHECK-LARGE-ZICFILP-NEXT: sw a1, 12(sp) +; CHECK-LARGE-ZICFILP-NEXT: sw a0, 12(sp) +; CHECK-LARGE-ZICFILP-NEXT: lw t2, %pcrel_lo(.Lpcrel_hi9)(a1) +; CHECK-LARGE-ZICFILP-NEXT: mv a0, sp ; CHECK-LARGE-ZICFILP-NEXT: jalr t2 ; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll index 50da56fbc5951..e28d98bf3047e 100644 --- a/llvm/test/CodeGen/RISCV/ucmp.ll +++ b/llvm/test/CodeGen/RISCV/ucmp.ll @@ -89,15 +89,15 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; RV32I-NEXT: lw a2, 4(a1) ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a6, 12(a0) ; RV32I-NEXT: lw a3, 4(a0) -; RV32I-NEXT: lw a7, 8(a0) -; RV32I-NEXT: beq a6, a5, .LBB4_2 +; RV32I-NEXT: lw a6, 8(a0) +; RV32I-NEXT: lw a7, 12(a0) +; RV32I-NEXT: beq a7, a5, .LBB4_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t2, a6, a5 +; RV32I-NEXT: sltu t2, a7, a5 ; RV32I-NEXT: j .LBB4_3 ; RV32I-NEXT: .LBB4_2: -; RV32I-NEXT: sltu t2, a7, a4 +; RV32I-NEXT: sltu t2, a6, a4 ; RV32I-NEXT: .LBB4_3: ; RV32I-NEXT: lw a1, 0(a1) ; RV32I-NEXT: lw t0, 0(a0) @@ -108,23 +108,23 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; RV32I-NEXT: .LBB4_5: ; RV32I-NEXT: sltu a0, t0, a1 ; RV32I-NEXT: .LBB4_6: -; RV32I-NEXT: xor t1, a6, a5 -; RV32I-NEXT: xor t3, a7, a4 +; RV32I-NEXT: xor t1, a7, a5 +; RV32I-NEXT: xor t3, a6, a4 ; RV32I-NEXT: or t1, t3, t1 ; RV32I-NEXT: beqz t1, .LBB4_8 ; RV32I-NEXT: # %bb.7: ; RV32I-NEXT: mv a0, t2 ; RV32I-NEXT: .LBB4_8: -; RV32I-NEXT: beq a6, a5, .LBB4_11 +; RV32I-NEXT: beq a7, a5, .LBB4_11 ; RV32I-NEXT: # %bb.9: -; RV32I-NEXT: sltu a4, a5, a6 +; RV32I-NEXT: sltu a4, a5, a7 ; RV32I-NEXT: bne a3, a2, .LBB4_12 ; RV32I-NEXT: .LBB4_10: ; RV32I-NEXT: sltu a1, a1, t0 ; RV32I-NEXT: bnez t1, .LBB4_13 ; RV32I-NEXT: j .LBB4_14 ; RV32I-NEXT: .LBB4_11: -; RV32I-NEXT: sltu a4, a4, a7 +; RV32I-NEXT: sltu a4, a4, a6 ; RV32I-NEXT: beq a3, a2, .LBB4_10 ; RV32I-NEXT: .LBB4_12: ; RV32I-NEXT: sltu a1, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll index 1cdfaa5c4154b..01a8a66f53f15 100644 --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -26,10 +26,10 @@ define i8 
@load_i8(ptr %p) { define i16 @load_i16(ptr %p) { ; SLOW-LABEL: load_i16: ; SLOW: # %bb.0: -; SLOW-NEXT: lbu a1, 1(a0) -; SLOW-NEXT: lbu a0, 0(a0) -; SLOW-NEXT: slli a1, a1, 8 -; SLOW-NEXT: or a0, a1, a0 +; SLOW-NEXT: lbu a1, 0(a0) +; SLOW-NEXT: lbu a0, 1(a0) +; SLOW-NEXT: slli a0, a0, 8 +; SLOW-NEXT: or a0, a0, a1 ; SLOW-NEXT: ret ; ; FAST-LABEL: load_i16: @@ -43,11 +43,11 @@ define i16 @load_i16(ptr %p) { define i24 @load_i24(ptr %p) { ; SLOWBASE-LABEL: load_i24: ; SLOWBASE: # %bb.0: -; SLOWBASE-NEXT: lbu a1, 1(a0) -; SLOWBASE-NEXT: lbu a2, 0(a0) +; SLOWBASE-NEXT: lbu a1, 0(a0) +; SLOWBASE-NEXT: lbu a2, 1(a0) ; SLOWBASE-NEXT: lbu a0, 2(a0) -; SLOWBASE-NEXT: slli a1, a1, 8 -; SLOWBASE-NEXT: or a1, a1, a2 +; SLOWBASE-NEXT: slli a2, a2, 8 +; SLOWBASE-NEXT: or a1, a2, a1 ; SLOWBASE-NEXT: slli a0, a0, 16 ; SLOWBASE-NEXT: or a0, a1, a0 ; SLOWBASE-NEXT: ret @@ -73,10 +73,10 @@ define i24 @load_i24(ptr %p) { ; ; FAST-LABEL: load_i24: ; FAST: # %bb.0: -; FAST-NEXT: lbu a1, 2(a0) -; FAST-NEXT: lhu a0, 0(a0) -; FAST-NEXT: slli a1, a1, 16 -; FAST-NEXT: or a0, a0, a1 +; FAST-NEXT: lhu a1, 0(a0) +; FAST-NEXT: lbu a0, 2(a0) +; FAST-NEXT: slli a0, a0, 16 +; FAST-NEXT: or a0, a1, a0 ; FAST-NEXT: ret %res = load i24, ptr %p, align 1 ret i24 %res @@ -85,12 +85,12 @@ define i24 @load_i24(ptr %p) { define i32 @load_i32(ptr %p) { ; SLOWBASE-LABEL: load_i32: ; SLOWBASE: # %bb.0: -; SLOWBASE-NEXT: lbu a1, 1(a0) -; SLOWBASE-NEXT: lbu a2, 0(a0) +; SLOWBASE-NEXT: lbu a1, 0(a0) +; SLOWBASE-NEXT: lbu a2, 1(a0) ; SLOWBASE-NEXT: lbu a3, 2(a0) ; SLOWBASE-NEXT: lbu a0, 3(a0) -; SLOWBASE-NEXT: slli a1, a1, 8 -; SLOWBASE-NEXT: or a1, a1, a2 +; SLOWBASE-NEXT: slli a2, a2, 8 +; SLOWBASE-NEXT: or a1, a2, a1 ; SLOWBASE-NEXT: slli a3, a3, 16 ; SLOWBASE-NEXT: slli a0, a0, 24 ; SLOWBASE-NEXT: or a0, a0, a3 @@ -99,13 +99,13 @@ define i32 @load_i32(ptr %p) { ; ; RV32IZBKB-LABEL: load_i32: ; RV32IZBKB: # %bb.0: -; RV32IZBKB-NEXT: lbu a1, 1(a0) -; RV32IZBKB-NEXT: lbu a2, 2(a0) -; RV32IZBKB-NEXT: lbu a3, 3(a0) -; RV32IZBKB-NEXT: lbu a0, 0(a0) -; RV32IZBKB-NEXT: packh a2, a2, a3 -; RV32IZBKB-NEXT: packh a0, a0, a1 -; RV32IZBKB-NEXT: pack a0, a0, a2 +; RV32IZBKB-NEXT: lbu a1, 0(a0) +; RV32IZBKB-NEXT: lbu a2, 1(a0) +; RV32IZBKB-NEXT: lbu a3, 2(a0) +; RV32IZBKB-NEXT: lbu a0, 3(a0) +; RV32IZBKB-NEXT: packh a0, a3, a0 +; RV32IZBKB-NEXT: packh a1, a1, a2 +; RV32IZBKB-NEXT: pack a0, a1, a0 ; RV32IZBKB-NEXT: ret ; ; RV64IZBKB-LABEL: load_i32: @@ -132,50 +132,50 @@ define i32 @load_i32(ptr %p) { define i64 @load_i64(ptr %p) { ; RV32I-LABEL: load_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: lbu a2, 2(a0) -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: lbu a4, 0(a0) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: slli a2, a2, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: lbu a4, 4(a0) -; RV32I-NEXT: lbu a5, 5(a0) -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: lbu a3, 6(a0) -; RV32I-NEXT: lbu a0, 7(a0) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a2, 1(a0) +; RV32I-NEXT: lbu a3, 2(a0) +; RV32I-NEXT: lbu a4, 3(a0) +; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a3, a3, 16 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a2, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 6(a0) +; RV32I-NEXT: lbu a0, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a2, a4, a2 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a0, a3 -; RV32I-NEXT: or a0, 
a2, a1 -; RV32I-NEXT: or a1, a3, a4 +; RV32I-NEXT: or a5, a0, a5 +; RV32I-NEXT: or a0, a3, a1 +; RV32I-NEXT: or a1, a5, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: load_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a2, 2(a0) -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: lbu a4, 0(a0) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: slli a2, a2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: lbu a4, 4(a0) -; RV64I-NEXT: lbu a5, 5(a0) -; RV64I-NEXT: or a2, a3, a2 -; RV64I-NEXT: lbu a3, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a1, 0(a0) +; RV64I-NEXT: lbu a2, 1(a0) +; RV64I-NEXT: lbu a3, 2(a0) +; RV64I-NEXT: lbu a4, 3(a0) +; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: slli a4, a4, 24 ; RV64I-NEXT: or a1, a2, a1 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a2, 4(a0) +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a2, a4, a2 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret @@ -186,16 +186,16 @@ define i64 @load_i64(ptr %p) { ; RV32IZBKB-NEXT: lbu a2, 1(a0) ; RV32IZBKB-NEXT: lbu a3, 2(a0) ; RV32IZBKB-NEXT: lbu a4, 3(a0) -; RV32IZBKB-NEXT: lbu a5, 5(a0) -; RV32IZBKB-NEXT: lbu a6, 6(a0) -; RV32IZBKB-NEXT: lbu a7, 7(a0) -; RV32IZBKB-NEXT: lbu a0, 4(a0) ; RV32IZBKB-NEXT: packh a3, a3, a4 ; RV32IZBKB-NEXT: packh a1, a1, a2 -; RV32IZBKB-NEXT: packh a2, a6, a7 -; RV32IZBKB-NEXT: packh a4, a0, a5 +; RV32IZBKB-NEXT: lbu a2, 4(a0) +; RV32IZBKB-NEXT: lbu a4, 5(a0) +; RV32IZBKB-NEXT: lbu a5, 6(a0) +; RV32IZBKB-NEXT: lbu a0, 7(a0) +; RV32IZBKB-NEXT: packh a5, a5, a0 +; RV32IZBKB-NEXT: packh a2, a2, a4 ; RV32IZBKB-NEXT: pack a0, a1, a3 -; RV32IZBKB-NEXT: pack a1, a4, a2 +; RV32IZBKB-NEXT: pack a1, a2, a5 ; RV32IZBKB-NEXT: ret ; ; RV64IZBKB-LABEL: load_i64: @@ -204,14 +204,14 @@ define i64 @load_i64(ptr %p) { ; RV64IZBKB-NEXT: lbu a2, 5(a0) ; RV64IZBKB-NEXT: lbu a3, 6(a0) ; RV64IZBKB-NEXT: lbu a4, 7(a0) -; RV64IZBKB-NEXT: lbu a5, 0(a0) -; RV64IZBKB-NEXT: lbu a6, 1(a0) -; RV64IZBKB-NEXT: lbu a7, 2(a0) -; RV64IZBKB-NEXT: lbu a0, 3(a0) ; RV64IZBKB-NEXT: packh a1, a1, a2 ; RV64IZBKB-NEXT: packh a2, a3, a4 -; RV64IZBKB-NEXT: packh a3, a5, a6 -; RV64IZBKB-NEXT: packh a0, a7, a0 +; RV64IZBKB-NEXT: lbu a3, 0(a0) +; RV64IZBKB-NEXT: lbu a4, 1(a0) +; RV64IZBKB-NEXT: lbu a5, 2(a0) +; RV64IZBKB-NEXT: lbu a0, 3(a0) +; RV64IZBKB-NEXT: packh a3, a3, a4 +; RV64IZBKB-NEXT: packh a0, a5, a0 ; RV64IZBKB-NEXT: slli a2, a2, 16 ; RV64IZBKB-NEXT: slli a0, a0, 16 ; RV64IZBKB-NEXT: or a1, a2, a1 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index c73a18c8869d5..106acff8fab95 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -331,13 +331,13 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 -; RV32-NEXT: lbu a0, 4(a0) -; RV32-NEXT: lw a1, 0(s0) -; RV32-NEXT: slli a0, a0, 10 -; RV32-NEXT: srli s1, a1, 22 -; RV32-NEXT: or s1, s1, a0 -; RV32-NEXT: srli s2, a1, 11 -; RV32-NEXT: andi a0, 
a1, 2047 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lbu a1, 4(s0) +; RV32-NEXT: slli a1, a1, 10 +; RV32-NEXT: srli s1, a0, 22 +; RV32-NEXT: or s1, s1, a1 +; RV32-NEXT: srli s2, a0, 11 +; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: li a1, 683 ; RV32-NEXT: call __mulsi3 ; RV32-NEXT: slli a1, a0, 10 @@ -388,10 +388,10 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lbu a0, 4(a0) -; RV64-NEXT: lwu a1, 0(s0) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: lwu a0, 0(a0) +; RV64-NEXT: lbu a1, 4(s0) +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: srli s1, a0, 22 ; RV64-NEXT: srli s2, a0, 11 ; RV64-NEXT: andi a0, a0, 2047 @@ -438,40 +438,40 @@ define void @test_urem_vec(ptr %X) nounwind { ; ; RV32M-LABEL: test_urem_vec: ; RV32M: # %bb.0: -; RV32M-NEXT: lbu a1, 4(a0) -; RV32M-NEXT: lw a2, 0(a0) +; RV32M-NEXT: lw a1, 0(a0) +; RV32M-NEXT: lbu a2, 4(a0) ; RV32M-NEXT: li a3, 683 ; RV32M-NEXT: li a4, 819 -; RV32M-NEXT: slli a1, a1, 10 -; RV32M-NEXT: srli a5, a2, 22 -; RV32M-NEXT: or a1, a5, a1 -; RV32M-NEXT: andi a5, a2, 2047 +; RV32M-NEXT: slli a2, a2, 10 +; RV32M-NEXT: srli a5, a1, 22 +; RV32M-NEXT: or a2, a5, a2 +; RV32M-NEXT: andi a5, a1, 2047 ; RV32M-NEXT: mul a3, a5, a3 ; RV32M-NEXT: li a5, 1463 -; RV32M-NEXT: srli a2, a2, 11 -; RV32M-NEXT: mul a2, a2, a5 +; RV32M-NEXT: srli a1, a1, 11 +; RV32M-NEXT: mul a1, a1, a5 ; RV32M-NEXT: slli a5, a3, 10 ; RV32M-NEXT: slli a3, a3, 21 -; RV32M-NEXT: mul a1, a1, a4 -; RV32M-NEXT: addi a2, a2, -1463 +; RV32M-NEXT: mul a2, a2, a4 +; RV32M-NEXT: addi a1, a1, -1463 ; RV32M-NEXT: srli a3, a3, 22 -; RV32M-NEXT: addi a1, a1, -1638 -; RV32M-NEXT: andi a2, a2, 2047 -; RV32M-NEXT: or a3, a3, a5 +; RV32M-NEXT: addi a2, a2, -1638 ; RV32M-NEXT: andi a1, a1, 2047 -; RV32M-NEXT: sltiu a2, a2, 293 +; RV32M-NEXT: or a3, a3, a5 +; RV32M-NEXT: andi a2, a2, 2047 +; RV32M-NEXT: sltiu a1, a1, 293 ; RV32M-NEXT: andi a3, a3, 2047 -; RV32M-NEXT: sltiu a1, a1, 2 -; RV32M-NEXT: addi a2, a2, -1 -; RV32M-NEXT: sltiu a3, a3, 342 -; RV32M-NEXT: xori a4, a1, 1 +; RV32M-NEXT: sltiu a2, a2, 2 ; RV32M-NEXT: addi a1, a1, -1 -; RV32M-NEXT: andi a2, a2, 2047 +; RV32M-NEXT: sltiu a3, a3, 342 +; RV32M-NEXT: xori a4, a2, 1 +; RV32M-NEXT: addi a2, a2, -1 +; RV32M-NEXT: andi a1, a1, 2047 ; RV32M-NEXT: addi a3, a3, -1 -; RV32M-NEXT: slli a2, a2, 11 -; RV32M-NEXT: slli a1, a1, 22 +; RV32M-NEXT: slli a1, a1, 11 +; RV32M-NEXT: slli a2, a2, 22 ; RV32M-NEXT: andi a3, a3, 2047 -; RV32M-NEXT: or a1, a2, a1 +; RV32M-NEXT: or a1, a1, a2 ; RV32M-NEXT: or a1, a3, a1 ; RV32M-NEXT: sw a1, 0(a0) ; RV32M-NEXT: sb a4, 4(a0) @@ -479,12 +479,12 @@ define void @test_urem_vec(ptr %X) nounwind { ; ; RV64M-LABEL: test_urem_vec: ; RV64M: # %bb.0: -; RV64M-NEXT: lbu a1, 4(a0) -; RV64M-NEXT: lwu a2, 0(a0) +; RV64M-NEXT: lwu a1, 0(a0) +; RV64M-NEXT: lbu a2, 4(a0) ; RV64M-NEXT: li a3, 683 ; RV64M-NEXT: li a4, 1463 -; RV64M-NEXT: slli a1, a1, 32 -; RV64M-NEXT: or a1, a2, a1 +; RV64M-NEXT: slli a2, a2, 32 +; RV64M-NEXT: or a1, a1, a2 ; RV64M-NEXT: andi a2, a1, 2047 ; RV64M-NEXT: mul a2, a2, a3 ; RV64M-NEXT: srli a3, a1, 11 @@ -538,15 +538,9 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV-NEXT: srli a1, a1, 21 ; RV32MV-NEXT: vslide1down.vx v10, v10, a1 ; RV32MV-NEXT: li a1, 2047 -; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV32MV-NEXT: vmv.v.i v11, 1 +; RV32MV-NEXT: addi a3, a3, -1527 ; RV32MV-NEXT: andi a2, a2, 2047 -; RV32MV-NEXT: 
vsetvli zero, zero, e16, mf2, ta, ma ; RV32MV-NEXT: vslide1down.vx v10, v10, a2 -; RV32MV-NEXT: lui a2, %hi(.LCPI4_1) -; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_1) -; RV32MV-NEXT: addi a3, a3, -1527 -; RV32MV-NEXT: vsext.vf2 v12, v11 ; RV32MV-NEXT: vslidedown.vi v10, v10, 1 ; RV32MV-NEXT: vsub.vv v8, v10, v8 ; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -556,14 +550,20 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vmul.vv v8, v8, v9 ; RV32MV-NEXT: vadd.vv v9, v8, v8 ; RV32MV-NEXT: vsll.vv v9, v9, v11 -; RV32MV-NEXT: vle16.v v10, (a2) +; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32MV-NEXT: vmv.v.i v10, 1 +; RV32MV-NEXT: lui a2, %hi(.LCPI4_1) +; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_1) +; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV32MV-NEXT: vsext.vf2 v11, v10 ; RV32MV-NEXT: vand.vx v8, v8, a1 -; RV32MV-NEXT: vsrl.vv v8, v8, v12 +; RV32MV-NEXT: vsrl.vv v8, v8, v11 +; RV32MV-NEXT: vmv.v.i v10, 0 ; RV32MV-NEXT: vor.vv v8, v8, v9 +; RV32MV-NEXT: vle16.v v9, (a2) ; RV32MV-NEXT: vand.vx v8, v8, a1 -; RV32MV-NEXT: vmsltu.vv v0, v10, v8 -; RV32MV-NEXT: vmv.v.i v8, 0 -; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32MV-NEXT: vmsltu.vv v0, v9, v8 +; RV32MV-NEXT: vmerge.vim v8, v10, -1, v0 ; RV32MV-NEXT: vslidedown.vi v9, v8, 2 ; RV32MV-NEXT: vmv.x.s a1, v8 ; RV32MV-NEXT: vslidedown.vi v8, v8, 1 @@ -599,15 +599,9 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: srli a2, a2, 53 ; RV64MV-NEXT: vslide1down.vx v10, v10, a2 ; RV64MV-NEXT: li a2, 2047 -; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64MV-NEXT: vmv.v.i v11, 1 +; RV64MV-NEXT: addi a3, a3, -1527 ; RV64MV-NEXT: srli a1, a1, 22 -; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64MV-NEXT: vslide1down.vx v10, v10, a1 -; RV64MV-NEXT: lui a1, %hi(.LCPI4_1) -; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_1) -; RV64MV-NEXT: addi a3, a3, -1527 -; RV64MV-NEXT: vsext.vf2 v12, v11 ; RV64MV-NEXT: vslidedown.vi v10, v10, 1 ; RV64MV-NEXT: vsub.vv v8, v10, v8 ; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -617,14 +611,20 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: vmul.vv v8, v8, v9 ; RV64MV-NEXT: vadd.vv v9, v8, v8 ; RV64MV-NEXT: vsll.vv v9, v9, v11 -; RV64MV-NEXT: vle16.v v10, (a1) +; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64MV-NEXT: vmv.v.i v10, 1 +; RV64MV-NEXT: lui a1, %hi(.LCPI4_1) +; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_1) +; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64MV-NEXT: vsext.vf2 v11, v10 ; RV64MV-NEXT: vand.vx v8, v8, a2 -; RV64MV-NEXT: vsrl.vv v8, v8, v12 +; RV64MV-NEXT: vsrl.vv v8, v8, v11 +; RV64MV-NEXT: vmv.v.i v10, 0 ; RV64MV-NEXT: vor.vv v8, v8, v9 +; RV64MV-NEXT: vle16.v v9, (a1) ; RV64MV-NEXT: vand.vx v8, v8, a2 -; RV64MV-NEXT: vmsltu.vv v0, v10, v8 -; RV64MV-NEXT: vmv.v.i v8, 0 -; RV64MV-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64MV-NEXT: vmsltu.vv v0, v9, v8 +; RV64MV-NEXT: vmerge.vim v8, v10, -1, v0 ; RV64MV-NEXT: vmv.x.s a1, v8 ; RV64MV-NEXT: vslidedown.vi v9, v8, 1 ; RV64MV-NEXT: vslidedown.vi v8, v8, 2 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 988856ca70923..c9d9ed13faa08 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -19,30 +19,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu a2, 0(a1) -; RV32I-NEXT: lhu s0, 
4(a1) -; RV32I-NEXT: lhu s1, 8(a1) -; RV32I-NEXT: lhu s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a0, 0(a1) +; RV32I-NEXT: lhu s1, 4(a1) +; RV32I-NEXT: lhu s2, 8(a1) +; RV32I-NEXT: lhu s3, 12(a1) ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 124 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: li a1, 98 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: li a1, 1003 +; RV32I-NEXT: li a1, 98 ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh s4, 0(s3) -; RV32I-NEXT: sh s0, 2(s3) -; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh a0, 6(s3) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: li a1, 1003 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh s4, 0(s0) +; RV32I-NEXT: sh s1, 2(s0) +; RV32I-NEXT: sh s2, 4(s0) +; RV32I-NEXT: sh a0, 6(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -98,30 +97,29 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu a2, 0(a1) -; RV64I-NEXT: lhu s0, 8(a1) -; RV64I-NEXT: lhu s1, 16(a1) -; RV64I-NEXT: lhu s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a0, 0(a1) +; RV64I-NEXT: lhu s1, 8(a1) +; RV64I-NEXT: lhu s2, 16(a1) +; RV64I-NEXT: lhu s3, 24(a1) ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 124 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: li a1, 1003 +; RV64I-NEXT: li a1, 98 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh s4, 0(s3) -; RV64I-NEXT: sh s0, 2(s3) -; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh a0, 6(s3) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: li a1, 1003 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh s4, 0(s0) +; RV64I-NEXT: sh s1, 2(s0) +; RV64I-NEXT: sh s2, 4(s0) +; RV64I-NEXT: sh a0, 6(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -140,18 +138,18 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lui a5, %hi(.LCPI0_0) ; RV64IM-NEXT: lui a6, %hi(.LCPI0_1) ; RV64IM-NEXT: li a7, 95 -; RV64IM-NEXT: ld a6, %lo(.LCPI0_1)(a6) ; RV64IM-NEXT: lui t0, %hi(.LCPI0_2) ; RV64IM-NEXT: li t1, 98 -; RV64IM-NEXT: ld t0, %lo(.LCPI0_2)(t0) +; RV64IM-NEXT: ld a6, %lo(.LCPI0_1)(a6) ; RV64IM-NEXT: mulhu a6, a2, a6 ; RV64IM-NEXT: mul a6, a6, a7 ; RV64IM-NEXT: lui a7, %hi(.LCPI0_3) -; RV64IM-NEXT: ld a5, %lo(.LCPI0_0)(a5) -; RV64IM-NEXT: ld a7, %lo(.LCPI0_3)(a7) +; RV64IM-NEXT: ld t0, %lo(.LCPI0_2)(t0) ; RV64IM-NEXT: mulhu t0, a4, t0 ; RV64IM-NEXT: mul t0, t0, t1 ; RV64IM-NEXT: li t1, 1003 +; RV64IM-NEXT: ld a5, %lo(.LCPI0_0)(a5) +; RV64IM-NEXT: ld a7, %lo(.LCPI0_3)(a7) ; RV64IM-NEXT: mulhu a5, a3, a5 ; RV64IM-NEXT: mulhu a7, a1, a7 ; RV64IM-NEXT: mul a7, a7, t1 @@ -181,30 +179,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 
4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu a2, 0(a1) -; RV32I-NEXT: lhu s0, 4(a1) -; RV32I-NEXT: lhu s1, 8(a1) -; RV32I-NEXT: lhu s2, 12(a1) -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a0, 0(a1) +; RV32I-NEXT: lhu s1, 4(a1) +; RV32I-NEXT: lhu s2, 8(a1) +; RV32I-NEXT: lhu s3, 12(a1) ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s4, a0 ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh s4, 0(s3) -; RV32I-NEXT: sh s0, 2(s3) -; RV32I-NEXT: sh s1, 4(s3) -; RV32I-NEXT: sh a0, 6(s3) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: li a1, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh s4, 0(s0) +; RV32I-NEXT: sh s1, 2(s0) +; RV32I-NEXT: sh s2, 4(s0) +; RV32I-NEXT: sh a0, 6(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -250,30 +247,29 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu a2, 0(a1) -; RV64I-NEXT: lhu s0, 8(a1) -; RV64I-NEXT: lhu s1, 16(a1) -; RV64I-NEXT: lhu s2, 24(a1) -; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a0, 0(a1) +; RV64I-NEXT: lhu s1, 8(a1) +; RV64I-NEXT: lhu s2, 16(a1) +; RV64I-NEXT: lhu s3, 24(a1) ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s4, a0 ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh s4, 0(s3) -; RV64I-NEXT: sh s0, 2(s3) -; RV64I-NEXT: sh s1, 4(s3) -; RV64I-NEXT: sh a0, 6(s3) +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: li a1, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh s4, 0(s0) +; RV64I-NEXT: sh s1, 2(s0) +; RV64I-NEXT: sh s2, 4(s0) +; RV64I-NEXT: sh a0, 6(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -285,28 +281,28 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_urem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lui a2, %hi(.LCPI1_0) -; RV64IM-NEXT: ld a2, %lo(.LCPI1_0)(a2) -; RV64IM-NEXT: lhu a3, 0(a1) -; RV64IM-NEXT: lhu a4, 8(a1) -; RV64IM-NEXT: lhu a5, 16(a1) +; RV64IM-NEXT: lhu a2, 0(a1) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 16(a1) ; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, %hi(.LCPI1_0) ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mulhu a7, a3, a2 -; RV64IM-NEXT: mulhu t0, a4, a2 -; RV64IM-NEXT: mulhu t1, a5, a2 -; RV64IM-NEXT: mulhu a2, a1, a2 +; RV64IM-NEXT: ld a5, %lo(.LCPI1_0)(a5) +; RV64IM-NEXT: mulhu a7, a2, a5 +; RV64IM-NEXT: mulhu t0, a3, a5 +; RV64IM-NEXT: mulhu t1, a4, a5 +; RV64IM-NEXT: mulhu a5, a1, a5 ; RV64IM-NEXT: mul a7, a7, a6 ; RV64IM-NEXT: mul t0, t0, a6 ; RV64IM-NEXT: mul t1, t1, a6 -; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a3, a3, a7 -; 
RV64IM-NEXT: subw a4, a4, t0 -; RV64IM-NEXT: subw a5, a5, t1 -; RV64IM-NEXT: subw a1, a1, a2 -; RV64IM-NEXT: sh a3, 0(a0) -; RV64IM-NEXT: sh a4, 2(a0) -; RV64IM-NEXT: sh a5, 4(a0) +; RV64IM-NEXT: mul a5, a5, a6 +; RV64IM-NEXT: subw a2, a2, a7 +; RV64IM-NEXT: subw a3, a3, t0 +; RV64IM-NEXT: subw a4, a4, t1 +; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a4, 4(a0) ; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, @@ -329,11 +325,11 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lhu s1, 0(a1) ; RV32I-NEXT: lhu s2, 4(a1) ; RV32I-NEXT: lhu s3, 8(a1) ; RV32I-NEXT: lhu s4, 12(a1) -; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a1, 95 ; RV32I-NEXT: mv a0, s4 ; RV32I-NEXT: call __umodsi3 @@ -430,11 +426,11 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s6, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s7, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s8, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lhu s1, 0(a1) ; RV64I-NEXT: lhu s2, 8(a1) ; RV64I-NEXT: lhu s3, 16(a1) ; RV64I-NEXT: lhu s4, 24(a1) -; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: mv a0, s4 ; RV64I-NEXT: call __umoddi3 @@ -489,33 +485,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a2, 16(a1) -; RV64IM-NEXT: lhu a3, 24(a1) -; RV64IM-NEXT: lui a4, %hi(.LCPI2_0) -; RV64IM-NEXT: ld a4, %lo(.LCPI2_0)(a4) -; RV64IM-NEXT: lhu a5, 0(a1) -; RV64IM-NEXT: lhu a1, 8(a1) +; RV64IM-NEXT: lhu a2, 0(a1) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 16(a1) +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, %hi(.LCPI2_0) ; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: mulhu a7, a3, a4 -; RV64IM-NEXT: mulhu t0, a2, a4 -; RV64IM-NEXT: mulhu t1, a1, a4 -; RV64IM-NEXT: mulhu a4, a5, a4 +; RV64IM-NEXT: ld a5, %lo(.LCPI2_0)(a5) +; RV64IM-NEXT: mulhu a7, a1, a5 +; RV64IM-NEXT: mulhu t0, a4, a5 +; RV64IM-NEXT: mulhu t1, a3, a5 +; RV64IM-NEXT: mulhu a5, a2, a5 ; RV64IM-NEXT: mul t2, a7, a6 ; RV64IM-NEXT: mul t3, t0, a6 ; RV64IM-NEXT: mul t4, t1, a6 -; RV64IM-NEXT: mul a6, a4, a6 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: add a1, a1, t1 -; RV64IM-NEXT: add a2, a2, t0 -; RV64IM-NEXT: add a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, a6 -; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a2, a2, t3 -; RV64IM-NEXT: subw a3, a3, t2 -; RV64IM-NEXT: sh a4, 0(a0) -; RV64IM-NEXT: sh a1, 2(a0) -; RV64IM-NEXT: sh a2, 4(a0) -; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: mul a6, a5, a6 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: add a3, a3, t1 +; RV64IM-NEXT: add a4, a4, t0 +; RV64IM-NEXT: add a1, a1, a7 +; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: subw a3, a3, t4 +; RV64IM-NEXT: subw a4, a4, t3 +; RV64IM-NEXT: subw a1, a1, t2 +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: sh a1, 6(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -533,13 +529,12 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lhu s1, 0(a1) ; RV32I-NEXT: lhu s2, 4(a1) ; 
RV32I-NEXT: lhu s3, 8(a1) -; RV32I-NEXT: lhu a2, 12(a1) -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a0, 12(a1) ; RV32I-NEXT: li a1, 95 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: andi a1, s1, 63 ; RV32I-NEXT: andi a2, s2, 31 @@ -585,13 +580,12 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: lhu s1, 0(a1) ; RV64I-NEXT: lhu s2, 8(a1) ; RV64I-NEXT: lhu s3, 16(a1) -; RV64I-NEXT: lhu a2, 24(a1) -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a0, 24(a1) ; RV64I-NEXT: li a1, 95 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: andi a1, s1, 63 ; RV64I-NEXT: andi a2, s2, 31 @@ -642,26 +636,25 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu a2, 4(a1) -; RV32I-NEXT: lhu s0, 8(a1) -; RV32I-NEXT: lhu s1, 12(a1) -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lhu a0, 4(a1) +; RV32I-NEXT: lhu s1, 8(a1) +; RV32I-NEXT: lhu s2, 12(a1) ; RV32I-NEXT: li a1, 654 -; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: call __umodsi3 ; RV32I-NEXT: mv s3, a0 ; RV32I-NEXT: li a1, 23 -; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a1, a0, 1327 -; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __umodsi3 -; RV32I-NEXT: sh zero, 0(s2) -; RV32I-NEXT: sh s3, 2(s2) -; RV32I-NEXT: sh s0, 4(s2) -; RV32I-NEXT: sh a0, 6(s2) +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh a0, 6(s0) ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload @@ -708,26 +701,25 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu a2, 8(a1) -; RV64I-NEXT: lhu s0, 16(a1) -; RV64I-NEXT: lhu s1, 24(a1) -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lhu a0, 8(a1) +; RV64I-NEXT: lhu s1, 16(a1) +; RV64I-NEXT: lhu s2, 24(a1) ; RV64I-NEXT: li a1, 654 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sh zero, 0(s2) -; RV64I-NEXT: sh s3, 2(s2) -; RV64I-NEXT: sh s0, 4(s2) -; RV64I-NEXT: sh a0, 6(s2) +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh a0, 6(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -743,17 +735,17 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV64IM-NEXT: lhu a1, 24(a1) ; RV64IM-NEXT: lui a4, %hi(.LCPI4_0) ; RV64IM-NEXT: li a5, 654 -; RV64IM-NEXT: ld a4, %lo(.LCPI4_0)(a4) ; RV64IM-NEXT: lui a6, %hi(.LCPI4_1) ; 
RV64IM-NEXT: li a7, 23 -; RV64IM-NEXT: ld a6, %lo(.LCPI4_1)(a6) +; RV64IM-NEXT: ld a4, %lo(.LCPI4_0)(a4) ; RV64IM-NEXT: mulhu a4, a2, a4 ; RV64IM-NEXT: mul a4, a4, a5 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_2) -; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5) +; RV64IM-NEXT: ld a6, %lo(.LCPI4_1)(a6) ; RV64IM-NEXT: mulhu a6, a3, a6 ; RV64IM-NEXT: mul a6, a6, a7 ; RV64IM-NEXT: lui a7, 1 +; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5) ; RV64IM-NEXT: addi a7, a7, 1327 ; RV64IM-NEXT: mulhu a5, a1, a5 ; RV64IM-NEXT: mul a5, a5, a7 @@ -793,18 +785,17 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: lw s1, 16(a1) ; RV32I-NEXT: lw s2, 20(a1) ; RV32I-NEXT: lw s3, 24(a1) ; RV32I-NEXT: lw s4, 28(a1) -; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: lw a4, 4(a1) +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw a3, 4(a1) ; RV32I-NEXT: lw s5, 8(a1) ; RV32I-NEXT: lw s6, 12(a1) -; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: mv a0, a3 -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3 ; RV32I-NEXT: mv s7, a0 @@ -863,18 +854,17 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: lw s1, 16(a1) ; RV32IM-NEXT: lw s2, 20(a1) ; RV32IM-NEXT: lw s3, 24(a1) ; RV32IM-NEXT: lw s4, 28(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a4, 4(a1) +; RV32IM-NEXT: lw a0, 0(a1) +; RV32IM-NEXT: lw a3, 4(a1) ; RV32IM-NEXT: lw s5, 8(a1) ; RV32IM-NEXT: lw s6, 12(a1) -; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: li a2, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a1, a4 +; RV32IM-NEXT: mv a1, a3 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3 ; RV32IM-NEXT: mv s7, a0 @@ -928,26 +918,25 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV64I-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: ld a2, 8(a1) -; RV64I-NEXT: ld s0, 16(a1) -; RV64I-NEXT: ld s1, 24(a1) -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: ld a0, 8(a1) +; RV64I-NEXT: ld s1, 16(a1) +; RV64I-NEXT: ld s2, 24(a1) ; RV64I-NEXT: li a1, 654 -; RV64I-NEXT: mv a0, a2 ; RV64I-NEXT: call __umoddi3 ; RV64I-NEXT: mv s3, a0 ; RV64I-NEXT: li a1, 23 -; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a1, a0, 1327 -; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __umoddi3 -; RV64I-NEXT: sd zero, 0(s2) -; RV64I-NEXT: sd s3, 8(s2) -; RV64I-NEXT: sd s0, 16(s2) -; RV64I-NEXT: sd a0, 24(s2) +; RV64I-NEXT: sd zero, 0(s0) +; RV64I-NEXT: sd s3, 8(s0) +; RV64I-NEXT: sd s1, 16(s0) +; RV64I-NEXT: sd a0, 24(s0) ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -961,31 +950,31 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV64IM-NEXT: ld a2, 8(a1) ; RV64IM-NEXT: ld a3, 16(a1) ; RV64IM-NEXT: ld a1, 24(a1) -; RV64IM-NEXT: lui a4, %hi(.LCPI6_1) -; RV64IM-NEXT: ld a4, %lo(.LCPI6_1)(a4) -; RV64IM-NEXT: lui a5, %hi(.LCPI6_0) +; RV64IM-NEXT: lui a4, 
%hi(.LCPI6_0) +; RV64IM-NEXT: lui a5, %hi(.LCPI6_1) ; RV64IM-NEXT: li a6, 654 +; RV64IM-NEXT: ld a5, %lo(.LCPI6_1)(a5) ; RV64IM-NEXT: srli a7, a2, 1 -; RV64IM-NEXT: mulhu a4, a7, a4 +; RV64IM-NEXT: mulhu a5, a7, a5 ; RV64IM-NEXT: lui a7, %hi(.LCPI6_2) -; RV64IM-NEXT: ld a5, %lo(.LCPI6_0)(a5) -; RV64IM-NEXT: ld a7, %lo(.LCPI6_2)(a7) -; RV64IM-NEXT: srli a4, a4, 7 -; RV64IM-NEXT: mul a4, a4, a6 +; RV64IM-NEXT: srli a5, a5, 7 +; RV64IM-NEXT: mul a5, a5, a6 ; RV64IM-NEXT: lui a6, 1 +; RV64IM-NEXT: ld a4, %lo(.LCPI6_0)(a4) +; RV64IM-NEXT: ld a7, %lo(.LCPI6_2)(a7) ; RV64IM-NEXT: addiw a6, a6, 1327 -; RV64IM-NEXT: mulhu a5, a3, a5 +; RV64IM-NEXT: mulhu a4, a3, a4 ; RV64IM-NEXT: mulhu a7, a1, a7 ; RV64IM-NEXT: srli a7, a7, 12 ; RV64IM-NEXT: mul a6, a7, a6 -; RV64IM-NEXT: sub a7, a3, a5 +; RV64IM-NEXT: sub a7, a3, a4 ; RV64IM-NEXT: srli a7, a7, 1 -; RV64IM-NEXT: add a5, a7, a5 -; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: add a4, a7, a4 +; RV64IM-NEXT: sub a2, a2, a5 ; RV64IM-NEXT: sub a1, a1, a6 -; RV64IM-NEXT: li a4, 23 -; RV64IM-NEXT: srli a5, a5, 4 -; RV64IM-NEXT: mul a4, a5, a4 +; RV64IM-NEXT: li a5, 23 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: mul a4, a4, a5 ; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sd zero, 0(a0) ; RV64IM-NEXT: sd a2, 8(a0) diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index 895d84b38be32..2d6434ebdb434 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -162,16 +162,16 @@ define i32 @va1(ptr %fmt, ...) { ; LP64-LP64F-LP64D-FPELIM: # %bb.0: ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, -80 ; LP64-LP64F-LP64D-FPELIM-NEXT: .cfi_def_cfa_offset 80 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a1, 24(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, sp, 28 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a0, 8(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: lw a0, 24(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a5, 56(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a6, 64(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a7, 72(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a1, 24(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a2, 32(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 40(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a4, 48(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, sp, 28 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a0, 8(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: lw a0, 24(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, 80 ; LP64-LP64F-LP64D-FPELIM-NEXT: .cfi_def_cfa_offset 0 ; LP64-LP64F-LP64D-FPELIM-NEXT: ret @@ -186,16 +186,16 @@ define i32 @va1(ptr %fmt, ...) 
{ ; LP64-LP64F-LP64D-WITHFP-NEXT: .cfi_offset s0, -80 ; LP64-LP64F-LP64D-WITHFP-NEXT: addi s0, sp, 32 ; LP64-LP64F-LP64D-WITHFP-NEXT: .cfi_def_cfa s0, 64 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a1, 8(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, s0, 12 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a0, -24(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: lw a0, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a5, 40(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a6, 48(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a7, 56(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a1, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a2, 16(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, 24(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a4, 32(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, s0, 12 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a0, -24(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: lw a0, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: .cfi_def_cfa sp, 96 ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -209,14 +209,14 @@ define i32 @va1(ptr %fmt, ...) { ; LP64E-FPELIM: # %bb.0: ; LP64E-FPELIM-NEXT: addi sp, sp, -56 ; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 56 -; LP64E-FPELIM-NEXT: addi a0, sp, 20 -; LP64E-FPELIM-NEXT: sd a0, 0(sp) -; LP64E-FPELIM-NEXT: sd a1, 16(sp) -; LP64E-FPELIM-NEXT: lw a0, 16(sp) ; LP64E-FPELIM-NEXT: sd a5, 48(sp) +; LP64E-FPELIM-NEXT: sd a1, 16(sp) ; LP64E-FPELIM-NEXT: sd a2, 24(sp) ; LP64E-FPELIM-NEXT: sd a3, 32(sp) ; LP64E-FPELIM-NEXT: sd a4, 40(sp) +; LP64E-FPELIM-NEXT: addi a0, sp, 20 +; LP64E-FPELIM-NEXT: sd a0, 0(sp) +; LP64E-FPELIM-NEXT: lw a0, 16(sp) ; LP64E-FPELIM-NEXT: addi sp, sp, 56 ; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 0 ; LP64E-FPELIM-NEXT: ret @@ -231,14 +231,14 @@ define i32 @va1(ptr %fmt, ...) { ; LP64E-WITHFP-NEXT: .cfi_offset s0, -64 ; LP64E-WITHFP-NEXT: addi s0, sp, 24 ; LP64E-WITHFP-NEXT: .cfi_def_cfa s0, 48 -; LP64E-WITHFP-NEXT: addi a0, s0, 12 -; LP64E-WITHFP-NEXT: sd a0, -24(s0) -; LP64E-WITHFP-NEXT: sd a1, 8(s0) -; LP64E-WITHFP-NEXT: lw a0, 8(s0) ; LP64E-WITHFP-NEXT: sd a5, 40(s0) +; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: sd a2, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) +; LP64E-WITHFP-NEXT: addi a0, s0, 12 +; LP64E-WITHFP-NEXT: sd a0, -24(s0) +; LP64E-WITHFP-NEXT: lw a0, 8(s0) ; LP64E-WITHFP-NEXT: .cfi_def_cfa sp, 72 ; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload @@ -1348,10 +1348,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32-ILP32F-FPELIM-NEXT: sw a0, 4(sp) ; ILP32-ILP32F-FPELIM-NEXT: andi a3, a3, -8 ; ILP32-ILP32F-FPELIM-NEXT: sw a4, 4(sp) -; ILP32-ILP32F-FPELIM-NEXT: lw a0, 4(a3) -; ILP32-ILP32F-FPELIM-NEXT: lw a3, 0(a3) -; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a0 -; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3 +; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a3) +; ILP32-ILP32F-FPELIM-NEXT: lw a3, 4(a3) +; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a3 +; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0 ; ILP32-ILP32F-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32-ILP32F-FPELIM-NEXT: add a1, a2, a1 ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 32 @@ -1374,10 +1374,10 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; ILP32-ILP32F-WITHFP-NEXT: sw a0, -12(s0) ; ILP32-ILP32F-WITHFP-NEXT: andi a3, a3, -8 ; ILP32-ILP32F-WITHFP-NEXT: sw a4, -12(s0) -; ILP32-ILP32F-WITHFP-NEXT: lw a0, 4(a3) -; ILP32-ILP32F-WITHFP-NEXT: lw a3, 0(a3) -; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a0 -; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3 +; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a3) +; ILP32-ILP32F-WITHFP-NEXT: lw a3, 4(a3) +; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a3 +; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0 ; ILP32-ILP32F-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32-ILP32F-WITHFP-NEXT: add a1, a2, a1 ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload @@ -1399,10 +1399,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a0, 4(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a3, a3, -8 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a4, 4(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 4(a3) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 0(a3) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a0 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a3) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 4(a3) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a3 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 32 @@ -1420,10 +1420,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32E-FPELIM-NEXT: sw a0, 0(sp) ; ILP32E-FPELIM-NEXT: andi a3, a3, -8 ; ILP32E-FPELIM-NEXT: sw a4, 0(sp) -; ILP32E-FPELIM-NEXT: lw a0, 4(a3) -; ILP32E-FPELIM-NEXT: lw a3, 0(a3) -; ILP32E-FPELIM-NEXT: add a2, a2, a0 -; ILP32E-FPELIM-NEXT: add a0, a1, a3 +; ILP32E-FPELIM-NEXT: lw a0, 0(a3) +; ILP32E-FPELIM-NEXT: lw a3, 4(a3) +; ILP32E-FPELIM-NEXT: add a2, a2, a3 +; ILP32E-FPELIM-NEXT: add a0, a1, a0 ; ILP32E-FPELIM-NEXT: sltu a1, a0, a1 ; ILP32E-FPELIM-NEXT: add a1, a2, a1 ; ILP32E-FPELIM-NEXT: addi sp, sp, 20 @@ -1444,10 +1444,10 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; ILP32E-WITHFP-NEXT: sw a0, -12(s0) ; ILP32E-WITHFP-NEXT: andi a3, a3, -8 ; ILP32E-WITHFP-NEXT: sw a4, -12(s0) -; ILP32E-WITHFP-NEXT: lw a0, 4(a3) -; ILP32E-WITHFP-NEXT: lw a3, 0(a3) -; ILP32E-WITHFP-NEXT: add a2, a2, a0 -; ILP32E-WITHFP-NEXT: add a0, a1, a3 +; ILP32E-WITHFP-NEXT: lw a0, 0(a3) +; ILP32E-WITHFP-NEXT: lw a3, 4(a3) +; ILP32E-WITHFP-NEXT: add a2, a2, a3 +; ILP32E-WITHFP-NEXT: add a0, a1, a0 ; ILP32E-WITHFP-NEXT: sltu a1, a0, a1 ; ILP32E-WITHFP-NEXT: add a1, a2, a1 ; ILP32E-WITHFP-NEXT: lw ra, 8(sp) # 4-byte Folded Reload @@ -1464,9 +1464,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 24(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a4, 32(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a5, 40(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a3, sp, 31 +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, sp, 31 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a0, 8(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, a1, a2 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 8(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, 64 ; LP64-LP64F-LP64D-FPELIM-NEXT: ret ; @@ -1482,9 +1482,9 @@ define i64 @va3(i32 %a, i64 %b, ...) 
nounwind { ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a4, 16(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a5, 24(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a3, s0, 15 +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, s0, 15 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a0, -24(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: add a0, a1, a2 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, -24(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: addi sp, sp, 80 @@ -1497,9 +1497,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64E-FPELIM-NEXT: sd a3, 16(sp) ; LP64E-FPELIM-NEXT: sd a4, 24(sp) ; LP64E-FPELIM-NEXT: sd a5, 32(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 23 +; LP64E-FPELIM-NEXT: addi a0, sp, 23 +; LP64E-FPELIM-NEXT: sd a0, 0(sp) ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 0(sp) ; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; @@ -1513,9 +1513,9 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a3, 8(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a5, 24(s0) -; LP64E-WITHFP-NEXT: addi a3, s0, 15 +; LP64E-WITHFP-NEXT: addi a0, s0, 15 +; LP64E-WITHFP-NEXT: sd a0, -24(s0) ; LP64E-WITHFP-NEXT: add a0, a1, a2 -; LP64E-WITHFP-NEXT: sd a3, -24(s0) ; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: addi sp, sp, 56 @@ -1603,10 +1603,10 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 20(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: fld fa5, 0(a0) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: fsd fa5, 8(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 12(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 8(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a0 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 8(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 12(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a3 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 48 @@ -1668,9 +1668,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 24(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a4, 32(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a5, 40(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: addi a3, sp, 24 +; LP64-LP64F-LP64D-FPELIM-NEXT: addi a0, sp, 24 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a0, 8(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, a1, a2 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 8(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, 64 ; LP64-LP64F-LP64D-FPELIM-NEXT: ret ; @@ -1686,9 +1686,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) 
nounwind { ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a4, 16(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a5, 24(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a3, s0, 8 +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, s0, 8 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a0, -24(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: add a0, a1, a2 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, -24(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: addi sp, sp, 80 @@ -1701,9 +1701,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64E-FPELIM-NEXT: sd a3, 16(sp) ; LP64E-FPELIM-NEXT: sd a4, 24(sp) ; LP64E-FPELIM-NEXT: sd a5, 32(sp) -; LP64E-FPELIM-NEXT: addi a3, sp, 16 +; LP64E-FPELIM-NEXT: addi a0, sp, 16 +; LP64E-FPELIM-NEXT: sd a0, 0(sp) ; LP64E-FPELIM-NEXT: add a0, a1, a2 -; LP64E-FPELIM-NEXT: sd a3, 0(sp) ; LP64E-FPELIM-NEXT: addi sp, sp, 40 ; LP64E-FPELIM-NEXT: ret ; @@ -1717,9 +1717,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind { ; LP64E-WITHFP-NEXT: sd a3, 8(s0) ; LP64E-WITHFP-NEXT: sd a4, 16(s0) ; LP64E-WITHFP-NEXT: sd a5, 24(s0) -; LP64E-WITHFP-NEXT: addi a3, s0, 8 +; LP64E-WITHFP-NEXT: addi a0, s0, 8 +; LP64E-WITHFP-NEXT: sd a0, -24(s0) ; LP64E-WITHFP-NEXT: add a0, a1, a2 -; LP64E-WITHFP-NEXT: sd a3, -24(s0) ; LP64E-WITHFP-NEXT: ld ra, 16(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: addi sp, sp, 56 @@ -2275,40 +2275,40 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32-ILP32F-FPELIM: # %bb.0: ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, -64 ; ILP32-ILP32F-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; ILP32-ILP32F-FPELIM-NEXT: li a4, 17 -; ILP32-ILP32F-FPELIM-NEXT: li a5, 16 -; ILP32-ILP32F-FPELIM-NEXT: li a6, 15 -; ILP32-ILP32F-FPELIM-NEXT: lui a7, 262236 -; ILP32-ILP32F-FPELIM-NEXT: lui t0, 377487 -; ILP32-ILP32F-FPELIM-NEXT: li t1, 14 -; ILP32-ILP32F-FPELIM-NEXT: lui t2, 262153 -; ILP32-ILP32F-FPELIM-NEXT: lui t3, 545260 -; ILP32-ILP32F-FPELIM-NEXT: lui t4, 964690 -; ILP32-ILP32F-FPELIM-NEXT: lui t5, 335544 -; ILP32-ILP32F-FPELIM-NEXT: lui t6, 688509 +; ILP32-ILP32F-FPELIM-NEXT: li a3, 17 +; ILP32-ILP32F-FPELIM-NEXT: li a4, 16 +; ILP32-ILP32F-FPELIM-NEXT: li a5, 15 +; ILP32-ILP32F-FPELIM-NEXT: lui a6, 262236 +; ILP32-ILP32F-FPELIM-NEXT: lui a7, 377487 +; ILP32-ILP32F-FPELIM-NEXT: li t0, 14 +; ILP32-ILP32F-FPELIM-NEXT: lui t1, 262153 +; ILP32-ILP32F-FPELIM-NEXT: lui t2, 545260 +; ILP32-ILP32F-FPELIM-NEXT: lui t3, 964690 +; ILP32-ILP32F-FPELIM-NEXT: lui t4, 335544 +; ILP32-ILP32F-FPELIM-NEXT: lui t5, 688509 ; ILP32-ILP32F-FPELIM-NEXT: li a0, 1 ; ILP32-ILP32F-FPELIM-NEXT: li a1, 11 ; ILP32-ILP32F-FPELIM-NEXT: addi a2, sp, 32 +; ILP32-ILP32F-FPELIM-NEXT: sw a4, 20(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a3, 24(sp) ; ILP32-ILP32F-FPELIM-NEXT: li a3, 12 -; ILP32-ILP32F-FPELIM-NEXT: sw a5, 20(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a4, 24(sp) +; ILP32-ILP32F-FPELIM-NEXT: addi a4, a6, 655 +; ILP32-ILP32F-FPELIM-NEXT: addi a6, a7, 1475 +; ILP32-ILP32F-FPELIM-NEXT: sw t0, 0(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a6, 8(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a4, 12(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a5, 16(sp) ; ILP32-ILP32F-FPELIM-NEXT: li a4, 13 -; ILP32-ILP32F-FPELIM-NEXT: addi a5, a7, 655 -; ILP32-ILP32F-FPELIM-NEXT: addi a7, t0, 1475 -; ILP32-ILP32F-FPELIM-NEXT: sw t1, 0(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a7, 8(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a5, 12(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw a6, 16(sp) 
-; ILP32-ILP32F-FPELIM-NEXT: li a7, 4 -; ILP32-ILP32F-FPELIM-NEXT: addi a5, t2, 491 -; ILP32-ILP32F-FPELIM-NEXT: addi t0, t3, -1967 -; ILP32-ILP32F-FPELIM-NEXT: addi t1, t4, -328 -; ILP32-ILP32F-FPELIM-NEXT: addi t2, t5, 1311 -; ILP32-ILP32F-FPELIM-NEXT: addi a6, t6, -2048 -; ILP32-ILP32F-FPELIM-NEXT: sw t2, 32(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw t1, 36(sp) -; ILP32-ILP32F-FPELIM-NEXT: sw t0, 40(sp) +; ILP32-ILP32F-FPELIM-NEXT: addi a5, t1, 491 +; ILP32-ILP32F-FPELIM-NEXT: addi a7, t2, -1967 +; ILP32-ILP32F-FPELIM-NEXT: addi t0, t3, -328 +; ILP32-ILP32F-FPELIM-NEXT: addi t1, t4, 1311 +; ILP32-ILP32F-FPELIM-NEXT: addi a6, t5, -2048 +; ILP32-ILP32F-FPELIM-NEXT: sw t1, 32(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw t0, 36(sp) +; ILP32-ILP32F-FPELIM-NEXT: sw a7, 40(sp) ; ILP32-ILP32F-FPELIM-NEXT: sw a5, 44(sp) +; ILP32-ILP32F-FPELIM-NEXT: li a7, 4 ; ILP32-ILP32F-FPELIM-NEXT: call va5_aligned_stack_callee ; ILP32-ILP32F-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 64 @@ -2320,40 +2320,40 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32-ILP32F-WITHFP-NEXT: sw ra, 60(sp) # 4-byte Folded Spill ; ILP32-ILP32F-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; ILP32-ILP32F-WITHFP-NEXT: addi s0, sp, 64 -; ILP32-ILP32F-WITHFP-NEXT: li a4, 17 -; ILP32-ILP32F-WITHFP-NEXT: li a5, 16 -; ILP32-ILP32F-WITHFP-NEXT: li a6, 15 -; ILP32-ILP32F-WITHFP-NEXT: lui a7, 262236 -; ILP32-ILP32F-WITHFP-NEXT: lui t0, 377487 -; ILP32-ILP32F-WITHFP-NEXT: li t1, 14 -; ILP32-ILP32F-WITHFP-NEXT: lui t2, 262153 -; ILP32-ILP32F-WITHFP-NEXT: lui t3, 545260 -; ILP32-ILP32F-WITHFP-NEXT: lui t4, 964690 -; ILP32-ILP32F-WITHFP-NEXT: lui t5, 335544 -; ILP32-ILP32F-WITHFP-NEXT: lui t6, 688509 +; ILP32-ILP32F-WITHFP-NEXT: li a3, 17 +; ILP32-ILP32F-WITHFP-NEXT: li a4, 16 +; ILP32-ILP32F-WITHFP-NEXT: li a5, 15 +; ILP32-ILP32F-WITHFP-NEXT: lui a6, 262236 +; ILP32-ILP32F-WITHFP-NEXT: lui a7, 377487 +; ILP32-ILP32F-WITHFP-NEXT: li t0, 14 +; ILP32-ILP32F-WITHFP-NEXT: lui t1, 262153 +; ILP32-ILP32F-WITHFP-NEXT: lui t2, 545260 +; ILP32-ILP32F-WITHFP-NEXT: lui t3, 964690 +; ILP32-ILP32F-WITHFP-NEXT: lui t4, 335544 +; ILP32-ILP32F-WITHFP-NEXT: lui t5, 688509 ; ILP32-ILP32F-WITHFP-NEXT: li a0, 1 ; ILP32-ILP32F-WITHFP-NEXT: li a1, 11 ; ILP32-ILP32F-WITHFP-NEXT: addi a2, s0, -32 +; ILP32-ILP32F-WITHFP-NEXT: sw a4, 20(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a3, 24(sp) ; ILP32-ILP32F-WITHFP-NEXT: li a3, 12 -; ILP32-ILP32F-WITHFP-NEXT: sw a5, 20(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a4, 24(sp) +; ILP32-ILP32F-WITHFP-NEXT: addi a4, a6, 655 +; ILP32-ILP32F-WITHFP-NEXT: addi a6, a7, 1475 +; ILP32-ILP32F-WITHFP-NEXT: sw t0, 0(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a6, 8(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a4, 12(sp) +; ILP32-ILP32F-WITHFP-NEXT: sw a5, 16(sp) ; ILP32-ILP32F-WITHFP-NEXT: li a4, 13 -; ILP32-ILP32F-WITHFP-NEXT: addi a5, a7, 655 -; ILP32-ILP32F-WITHFP-NEXT: addi a7, t0, 1475 -; ILP32-ILP32F-WITHFP-NEXT: sw t1, 0(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a7, 8(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a5, 12(sp) -; ILP32-ILP32F-WITHFP-NEXT: sw a6, 16(sp) -; ILP32-ILP32F-WITHFP-NEXT: li a7, 4 -; ILP32-ILP32F-WITHFP-NEXT: addi a5, t2, 491 -; ILP32-ILP32F-WITHFP-NEXT: addi t0, t3, -1967 -; ILP32-ILP32F-WITHFP-NEXT: addi t1, t4, -328 -; ILP32-ILP32F-WITHFP-NEXT: addi t2, t5, 1311 -; ILP32-ILP32F-WITHFP-NEXT: addi a6, t6, -2048 -; ILP32-ILP32F-WITHFP-NEXT: sw t2, -32(s0) -; ILP32-ILP32F-WITHFP-NEXT: sw t1, -28(s0) -; ILP32-ILP32F-WITHFP-NEXT: sw t0, -24(s0) +; ILP32-ILP32F-WITHFP-NEXT: addi a5, t1, 491 +; 
ILP32-ILP32F-WITHFP-NEXT: addi a7, t2, -1967 +; ILP32-ILP32F-WITHFP-NEXT: addi t0, t3, -328 +; ILP32-ILP32F-WITHFP-NEXT: addi t1, t4, 1311 +; ILP32-ILP32F-WITHFP-NEXT: addi a6, t5, -2048 +; ILP32-ILP32F-WITHFP-NEXT: sw t1, -32(s0) +; ILP32-ILP32F-WITHFP-NEXT: sw t0, -28(s0) +; ILP32-ILP32F-WITHFP-NEXT: sw a7, -24(s0) ; ILP32-ILP32F-WITHFP-NEXT: sw a5, -20(s0) +; ILP32-ILP32F-WITHFP-NEXT: li a7, 4 ; ILP32-ILP32F-WITHFP-NEXT: call va5_aligned_stack_callee ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s0, 56(sp) # 4-byte Folded Reload @@ -2364,40 +2364,40 @@ define void @va5_aligned_stack_caller() nounwind { ; RV32D-ILP32-ILP32F-ILP32D-FPELIM: # %bb.0: ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, -64 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a5, 262236 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a6, 377487 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a4, 17 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a7, 16 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li t0, 15 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li t1, 14 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t2, 262153 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t3, 545260 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t4, 964690 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t5, 335544 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t6, 688509 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a4, 262236 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a5, 377487 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a3, 17 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a6, 16 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a7, 15 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li t0, 14 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t1, 262153 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t2, 545260 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t3, 964690 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t4, 335544 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui t5, 688509 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a0, 1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a1, 11 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a2, sp, 32 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a6, 20(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 24(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a3, 12 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a7, 20(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a4, 24(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a4, a4, 655 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, a5, 1475 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t0, 0(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a5, 8(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a4, 12(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a7, 16(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a4, 13 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, a5, 655 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, a6, 1475 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t1, 0(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a6, 8(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a5, 12(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t0, 16(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a7, 4 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, t2, 491 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t0, t3, -1967 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t1, t4, -328 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t2, t5, 1311 -; 
RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, t6, -2048 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t2, 32(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t1, 36(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t0, 40(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a5, t1, 491 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a7, t2, -1967 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t0, t3, -328 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi t1, t4, 1311 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a6, t5, -2048 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t1, 32(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw t0, 36(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a7, 40(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a5, 44(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: li a7, 4 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: call va5_aligned_stack_callee ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 64 @@ -2410,41 +2410,41 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32E-FPELIM-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; ILP32E-FPELIM-NEXT: addi s0, sp, 64 ; ILP32E-FPELIM-NEXT: andi sp, sp, -16 -; ILP32E-FPELIM-NEXT: li a3, 17 -; ILP32E-FPELIM-NEXT: li a4, 16 -; ILP32E-FPELIM-NEXT: li a5, 15 -; ILP32E-FPELIM-NEXT: lui a6, 262236 -; ILP32E-FPELIM-NEXT: lui a7, 377487 -; ILP32E-FPELIM-NEXT: li t0, 14 -; ILP32E-FPELIM-NEXT: li t1, 4 -; ILP32E-FPELIM-NEXT: lui t2, 262153 -; ILP32E-FPELIM-NEXT: lui t3, 545260 -; ILP32E-FPELIM-NEXT: lui t4, 964690 -; ILP32E-FPELIM-NEXT: lui t5, 335544 -; ILP32E-FPELIM-NEXT: lui t6, 688509 +; ILP32E-FPELIM-NEXT: li a2, 17 +; ILP32E-FPELIM-NEXT: li a3, 16 +; ILP32E-FPELIM-NEXT: li a4, 15 +; ILP32E-FPELIM-NEXT: lui a5, 262236 +; ILP32E-FPELIM-NEXT: lui a6, 377487 +; ILP32E-FPELIM-NEXT: li a7, 14 +; ILP32E-FPELIM-NEXT: li t0, 4 +; ILP32E-FPELIM-NEXT: lui t1, 262153 +; ILP32E-FPELIM-NEXT: lui t2, 545260 +; ILP32E-FPELIM-NEXT: lui t3, 964690 +; ILP32E-FPELIM-NEXT: lui t4, 335544 +; ILP32E-FPELIM-NEXT: lui t5, 688509 ; ILP32E-FPELIM-NEXT: li a0, 1 ; ILP32E-FPELIM-NEXT: li a1, 11 +; ILP32E-FPELIM-NEXT: sw a4, 16(sp) +; ILP32E-FPELIM-NEXT: sw a3, 20(sp) +; ILP32E-FPELIM-NEXT: sw a2, 24(sp) ; ILP32E-FPELIM-NEXT: addi a2, sp, 32 -; ILP32E-FPELIM-NEXT: sw a5, 16(sp) -; ILP32E-FPELIM-NEXT: sw a4, 20(sp) -; ILP32E-FPELIM-NEXT: sw a3, 24(sp) +; ILP32E-FPELIM-NEXT: addi a3, a5, 655 +; ILP32E-FPELIM-NEXT: addi a4, a6, 1475 +; ILP32E-FPELIM-NEXT: sw t0, 0(sp) +; ILP32E-FPELIM-NEXT: sw a7, 4(sp) +; ILP32E-FPELIM-NEXT: sw a4, 8(sp) +; ILP32E-FPELIM-NEXT: sw a3, 12(sp) ; ILP32E-FPELIM-NEXT: li a3, 12 -; ILP32E-FPELIM-NEXT: addi a4, a6, 655 -; ILP32E-FPELIM-NEXT: addi a5, a7, 1475 -; ILP32E-FPELIM-NEXT: sw t1, 0(sp) -; ILP32E-FPELIM-NEXT: sw t0, 4(sp) -; ILP32E-FPELIM-NEXT: sw a5, 8(sp) -; ILP32E-FPELIM-NEXT: sw a4, 12(sp) +; ILP32E-FPELIM-NEXT: addi a4, t1, 491 +; ILP32E-FPELIM-NEXT: addi a6, t2, -1967 +; ILP32E-FPELIM-NEXT: addi a7, t3, -328 +; ILP32E-FPELIM-NEXT: addi t0, t4, 1311 +; ILP32E-FPELIM-NEXT: addi a5, t5, -2048 +; ILP32E-FPELIM-NEXT: sw t0, 32(sp) +; ILP32E-FPELIM-NEXT: sw a7, 36(sp) +; ILP32E-FPELIM-NEXT: sw a6, 40(sp) +; ILP32E-FPELIM-NEXT: sw a4, 44(sp) ; ILP32E-FPELIM-NEXT: li a4, 13 -; ILP32E-FPELIM-NEXT: addi a6, t2, 491 -; ILP32E-FPELIM-NEXT: addi a7, t3, -1967 -; ILP32E-FPELIM-NEXT: addi t0, t4, -328 -; ILP32E-FPELIM-NEXT: addi t1, t5, 1311 -; ILP32E-FPELIM-NEXT: addi a5, t6, -2048 -; ILP32E-FPELIM-NEXT: sw t1, 32(sp) -; ILP32E-FPELIM-NEXT: sw t0, 36(sp) -; 
ILP32E-FPELIM-NEXT: sw a7, 40(sp) -; ILP32E-FPELIM-NEXT: sw a6, 44(sp) ; ILP32E-FPELIM-NEXT: call va5_aligned_stack_callee ; ILP32E-FPELIM-NEXT: addi sp, s0, -64 ; ILP32E-FPELIM-NEXT: lw ra, 60(sp) # 4-byte Folded Reload @@ -2459,41 +2459,41 @@ define void @va5_aligned_stack_caller() nounwind { ; ILP32E-WITHFP-NEXT: sw s0, 56(sp) # 4-byte Folded Spill ; ILP32E-WITHFP-NEXT: addi s0, sp, 64 ; ILP32E-WITHFP-NEXT: andi sp, sp, -16 -; ILP32E-WITHFP-NEXT: li a3, 17 -; ILP32E-WITHFP-NEXT: li a4, 16 -; ILP32E-WITHFP-NEXT: li a5, 15 -; ILP32E-WITHFP-NEXT: lui a6, 262236 -; ILP32E-WITHFP-NEXT: lui a7, 377487 -; ILP32E-WITHFP-NEXT: li t0, 14 -; ILP32E-WITHFP-NEXT: li t1, 4 -; ILP32E-WITHFP-NEXT: lui t2, 262153 -; ILP32E-WITHFP-NEXT: lui t3, 545260 -; ILP32E-WITHFP-NEXT: lui t4, 964690 -; ILP32E-WITHFP-NEXT: lui t5, 335544 -; ILP32E-WITHFP-NEXT: lui t6, 688509 +; ILP32E-WITHFP-NEXT: li a2, 17 +; ILP32E-WITHFP-NEXT: li a3, 16 +; ILP32E-WITHFP-NEXT: li a4, 15 +; ILP32E-WITHFP-NEXT: lui a5, 262236 +; ILP32E-WITHFP-NEXT: lui a6, 377487 +; ILP32E-WITHFP-NEXT: li a7, 14 +; ILP32E-WITHFP-NEXT: li t0, 4 +; ILP32E-WITHFP-NEXT: lui t1, 262153 +; ILP32E-WITHFP-NEXT: lui t2, 545260 +; ILP32E-WITHFP-NEXT: lui t3, 964690 +; ILP32E-WITHFP-NEXT: lui t4, 335544 +; ILP32E-WITHFP-NEXT: lui t5, 688509 ; ILP32E-WITHFP-NEXT: li a0, 1 ; ILP32E-WITHFP-NEXT: li a1, 11 +; ILP32E-WITHFP-NEXT: sw a4, 16(sp) +; ILP32E-WITHFP-NEXT: sw a3, 20(sp) +; ILP32E-WITHFP-NEXT: sw a2, 24(sp) ; ILP32E-WITHFP-NEXT: addi a2, sp, 32 -; ILP32E-WITHFP-NEXT: sw a5, 16(sp) -; ILP32E-WITHFP-NEXT: sw a4, 20(sp) -; ILP32E-WITHFP-NEXT: sw a3, 24(sp) +; ILP32E-WITHFP-NEXT: addi a3, a5, 655 +; ILP32E-WITHFP-NEXT: addi a4, a6, 1475 +; ILP32E-WITHFP-NEXT: sw t0, 0(sp) +; ILP32E-WITHFP-NEXT: sw a7, 4(sp) +; ILP32E-WITHFP-NEXT: sw a4, 8(sp) +; ILP32E-WITHFP-NEXT: sw a3, 12(sp) ; ILP32E-WITHFP-NEXT: li a3, 12 -; ILP32E-WITHFP-NEXT: addi a4, a6, 655 -; ILP32E-WITHFP-NEXT: addi a5, a7, 1475 -; ILP32E-WITHFP-NEXT: sw t1, 0(sp) -; ILP32E-WITHFP-NEXT: sw t0, 4(sp) -; ILP32E-WITHFP-NEXT: sw a5, 8(sp) -; ILP32E-WITHFP-NEXT: sw a4, 12(sp) +; ILP32E-WITHFP-NEXT: addi a4, t1, 491 +; ILP32E-WITHFP-NEXT: addi a6, t2, -1967 +; ILP32E-WITHFP-NEXT: addi a7, t3, -328 +; ILP32E-WITHFP-NEXT: addi t0, t4, 1311 +; ILP32E-WITHFP-NEXT: addi a5, t5, -2048 +; ILP32E-WITHFP-NEXT: sw t0, 32(sp) +; ILP32E-WITHFP-NEXT: sw a7, 36(sp) +; ILP32E-WITHFP-NEXT: sw a6, 40(sp) +; ILP32E-WITHFP-NEXT: sw a4, 44(sp) ; ILP32E-WITHFP-NEXT: li a4, 13 -; ILP32E-WITHFP-NEXT: addi a6, t2, 491 -; ILP32E-WITHFP-NEXT: addi a7, t3, -1967 -; ILP32E-WITHFP-NEXT: addi t0, t4, -328 -; ILP32E-WITHFP-NEXT: addi t1, t5, 1311 -; ILP32E-WITHFP-NEXT: addi a5, t6, -2048 -; ILP32E-WITHFP-NEXT: sw t1, 32(sp) -; ILP32E-WITHFP-NEXT: sw t0, 36(sp) -; ILP32E-WITHFP-NEXT: sw a7, 40(sp) -; ILP32E-WITHFP-NEXT: sw a6, 44(sp) ; ILP32E-WITHFP-NEXT: call va5_aligned_stack_callee ; ILP32E-WITHFP-NEXT: addi sp, s0, -64 ; ILP32E-WITHFP-NEXT: lw ra, 60(sp) # 4-byte Folded Reload @@ -2505,27 +2505,27 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64-LP64F-LP64D-FPELIM: # %bb.0: ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, -48 ; LP64-LP64F-LP64D-FPELIM-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; LP64-LP64F-LP64D-FPELIM-NEXT: li t0, 17 -; LP64-LP64F-LP64D-FPELIM-NEXT: li t1, 16 -; LP64-LP64F-LP64D-FPELIM-NEXT: li t2, 15 +; LP64-LP64F-LP64D-FPELIM-NEXT: li a7, 17 +; LP64-LP64F-LP64D-FPELIM-NEXT: li t0, 16 +; LP64-LP64F-LP64D-FPELIM-NEXT: li t1, 15 ; LP64-LP64F-LP64D-FPELIM-NEXT: lui a2, %hi(.LCPI11_0) ; 
LP64-LP64F-LP64D-FPELIM-NEXT: lui a3, %hi(.LCPI11_1) ; LP64-LP64F-LP64D-FPELIM-NEXT: lui a6, %hi(.LCPI11_2) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui t3, 2384 +; LP64-LP64F-LP64D-FPELIM-NEXT: lui t2, 2384 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a0, 1 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a1, 11 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a4, 12 ; LP64-LP64F-LP64D-FPELIM-NEXT: li a5, 13 -; LP64-LP64F-LP64D-FPELIM-NEXT: li a7, 14 -; LP64-LP64F-LP64D-FPELIM-NEXT: ld t4, %lo(.LCPI11_0)(a2) +; LP64-LP64F-LP64D-FPELIM-NEXT: ld t3, %lo(.LCPI11_0)(a2) ; LP64-LP64F-LP64D-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(a3) ; LP64-LP64F-LP64D-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(a6) -; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a6, t3, 761 +; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a6, t2, 761 ; LP64-LP64F-LP64D-FPELIM-NEXT: slli a6, a6, 11 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd t4, 0(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: sd t2, 8(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: sd t1, 16(sp) -; LP64-LP64F-LP64D-FPELIM-NEXT: sd t0, 24(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: sd t3, 0(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: sd t1, 8(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: sd t0, 16(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a7, 24(sp) +; LP64-LP64F-LP64D-FPELIM-NEXT: li a7, 14 ; LP64-LP64F-LP64D-FPELIM-NEXT: call va5_aligned_stack_callee ; LP64-LP64F-LP64D-FPELIM-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-FPELIM-NEXT: addi sp, sp, 48 @@ -2537,27 +2537,27 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64-LP64F-LP64D-WITHFP-NEXT: sd ra, 40(sp) # 8-byte Folded Spill ; LP64-LP64F-LP64D-WITHFP-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; LP64-LP64F-LP64D-WITHFP-NEXT: addi s0, sp, 48 -; LP64-LP64F-LP64D-WITHFP-NEXT: li t0, 17 -; LP64-LP64F-LP64D-WITHFP-NEXT: li t1, 16 -; LP64-LP64F-LP64D-WITHFP-NEXT: li t2, 15 +; LP64-LP64F-LP64D-WITHFP-NEXT: li a7, 17 +; LP64-LP64F-LP64D-WITHFP-NEXT: li t0, 16 +; LP64-LP64F-LP64D-WITHFP-NEXT: li t1, 15 ; LP64-LP64F-LP64D-WITHFP-NEXT: lui a2, %hi(.LCPI11_0) ; LP64-LP64F-LP64D-WITHFP-NEXT: lui a3, %hi(.LCPI11_1) ; LP64-LP64F-LP64D-WITHFP-NEXT: lui a6, %hi(.LCPI11_2) -; LP64-LP64F-LP64D-WITHFP-NEXT: lui t3, 2384 +; LP64-LP64F-LP64D-WITHFP-NEXT: lui t2, 2384 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a0, 1 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a1, 11 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a4, 12 ; LP64-LP64F-LP64D-WITHFP-NEXT: li a5, 13 -; LP64-LP64F-LP64D-WITHFP-NEXT: li a7, 14 -; LP64-LP64F-LP64D-WITHFP-NEXT: ld t4, %lo(.LCPI11_0)(a2) +; LP64-LP64F-LP64D-WITHFP-NEXT: ld t3, %lo(.LCPI11_0)(a2) ; LP64-LP64F-LP64D-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(a3) ; LP64-LP64F-LP64D-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(a6) -; LP64-LP64F-LP64D-WITHFP-NEXT: addiw a6, t3, 761 +; LP64-LP64F-LP64D-WITHFP-NEXT: addiw a6, t2, 761 ; LP64-LP64F-LP64D-WITHFP-NEXT: slli a6, a6, 11 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd t4, 0(sp) -; LP64-LP64F-LP64D-WITHFP-NEXT: sd t2, 8(sp) -; LP64-LP64F-LP64D-WITHFP-NEXT: sd t1, 16(sp) -; LP64-LP64F-LP64D-WITHFP-NEXT: sd t0, 24(sp) +; LP64-LP64F-LP64D-WITHFP-NEXT: sd t3, 0(sp) +; LP64-LP64F-LP64D-WITHFP-NEXT: sd t1, 8(sp) +; LP64-LP64F-LP64D-WITHFP-NEXT: sd t0, 16(sp) +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a7, 24(sp) +; LP64-LP64F-LP64D-WITHFP-NEXT: li a7, 14 ; LP64-LP64F-LP64D-WITHFP-NEXT: call va5_aligned_stack_callee ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 32(sp) # 8-byte Folded Reload @@ -2570,27 +2570,27 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64E-FPELIM-NEXT: sd ra, 48(sp) # 8-byte Folded Spill ; LP64E-FPELIM-NEXT: li a2, 17 ; LP64E-FPELIM-NEXT: 
li a3, 16 -; LP64E-FPELIM-NEXT: li a6, 15 -; LP64E-FPELIM-NEXT: lui a7, %hi(.LCPI11_0) -; LP64E-FPELIM-NEXT: li t0, 14 -; LP64E-FPELIM-NEXT: lui t1, 2384 -; LP64E-FPELIM-NEXT: lui t2, %hi(.LCPI11_1) -; LP64E-FPELIM-NEXT: lui t3, %hi(.LCPI11_2) +; LP64E-FPELIM-NEXT: li a5, 15 +; LP64E-FPELIM-NEXT: lui a6, %hi(.LCPI11_0) +; LP64E-FPELIM-NEXT: li a7, 14 +; LP64E-FPELIM-NEXT: lui t0, 2384 +; LP64E-FPELIM-NEXT: lui t1, %hi(.LCPI11_1) +; LP64E-FPELIM-NEXT: lui t2, %hi(.LCPI11_2) ; LP64E-FPELIM-NEXT: li a0, 1 ; LP64E-FPELIM-NEXT: li a1, 11 -; LP64E-FPELIM-NEXT: li a4, 12 ; LP64E-FPELIM-NEXT: sd a3, 32(sp) ; LP64E-FPELIM-NEXT: sd a2, 40(sp) +; LP64E-FPELIM-NEXT: li a4, 12 +; LP64E-FPELIM-NEXT: ld a6, %lo(.LCPI11_0)(a6) +; LP64E-FPELIM-NEXT: addiw t0, t0, 761 +; LP64E-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(t1) +; LP64E-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(t2) +; LP64E-FPELIM-NEXT: slli t0, t0, 11 +; LP64E-FPELIM-NEXT: sd t0, 0(sp) +; LP64E-FPELIM-NEXT: sd a7, 8(sp) +; LP64E-FPELIM-NEXT: sd a6, 16(sp) +; LP64E-FPELIM-NEXT: sd a5, 24(sp) ; LP64E-FPELIM-NEXT: li a5, 13 -; LP64E-FPELIM-NEXT: ld a7, %lo(.LCPI11_0)(a7) -; LP64E-FPELIM-NEXT: addiw t1, t1, 761 -; LP64E-FPELIM-NEXT: ld a2, %lo(.LCPI11_1)(t2) -; LP64E-FPELIM-NEXT: ld a3, %lo(.LCPI11_2)(t3) -; LP64E-FPELIM-NEXT: slli t1, t1, 11 -; LP64E-FPELIM-NEXT: sd t1, 0(sp) -; LP64E-FPELIM-NEXT: sd t0, 8(sp) -; LP64E-FPELIM-NEXT: sd a7, 16(sp) -; LP64E-FPELIM-NEXT: sd a6, 24(sp) ; LP64E-FPELIM-NEXT: call va5_aligned_stack_callee ; LP64E-FPELIM-NEXT: ld ra, 48(sp) # 8-byte Folded Reload ; LP64E-FPELIM-NEXT: addi sp, sp, 56 @@ -2604,27 +2604,27 @@ define void @va5_aligned_stack_caller() nounwind { ; LP64E-WITHFP-NEXT: addi s0, sp, 64 ; LP64E-WITHFP-NEXT: li a2, 17 ; LP64E-WITHFP-NEXT: li a3, 16 -; LP64E-WITHFP-NEXT: li a6, 15 -; LP64E-WITHFP-NEXT: lui a7, %hi(.LCPI11_0) -; LP64E-WITHFP-NEXT: li t0, 14 -; LP64E-WITHFP-NEXT: lui t1, 2384 -; LP64E-WITHFP-NEXT: lui t2, %hi(.LCPI11_1) -; LP64E-WITHFP-NEXT: lui t3, %hi(.LCPI11_2) +; LP64E-WITHFP-NEXT: li a5, 15 +; LP64E-WITHFP-NEXT: lui a6, %hi(.LCPI11_0) +; LP64E-WITHFP-NEXT: li a7, 14 +; LP64E-WITHFP-NEXT: lui t0, 2384 +; LP64E-WITHFP-NEXT: lui t1, %hi(.LCPI11_1) +; LP64E-WITHFP-NEXT: lui t2, %hi(.LCPI11_2) ; LP64E-WITHFP-NEXT: li a0, 1 ; LP64E-WITHFP-NEXT: li a1, 11 -; LP64E-WITHFP-NEXT: li a4, 12 ; LP64E-WITHFP-NEXT: sd a3, 32(sp) ; LP64E-WITHFP-NEXT: sd a2, 40(sp) +; LP64E-WITHFP-NEXT: li a4, 12 +; LP64E-WITHFP-NEXT: ld a6, %lo(.LCPI11_0)(a6) +; LP64E-WITHFP-NEXT: addiw t0, t0, 761 +; LP64E-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(t1) +; LP64E-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(t2) +; LP64E-WITHFP-NEXT: slli t0, t0, 11 +; LP64E-WITHFP-NEXT: sd t0, 0(sp) +; LP64E-WITHFP-NEXT: sd a7, 8(sp) +; LP64E-WITHFP-NEXT: sd a6, 16(sp) +; LP64E-WITHFP-NEXT: sd a5, 24(sp) ; LP64E-WITHFP-NEXT: li a5, 13 -; LP64E-WITHFP-NEXT: ld a7, %lo(.LCPI11_0)(a7) -; LP64E-WITHFP-NEXT: addiw t1, t1, 761 -; LP64E-WITHFP-NEXT: ld a2, %lo(.LCPI11_1)(t2) -; LP64E-WITHFP-NEXT: ld a3, %lo(.LCPI11_2)(t3) -; LP64E-WITHFP-NEXT: slli t1, t1, 11 -; LP64E-WITHFP-NEXT: sd t1, 0(sp) -; LP64E-WITHFP-NEXT: sd t0, 8(sp) -; LP64E-WITHFP-NEXT: sd a7, 16(sp) -; LP64E-WITHFP-NEXT: sd a6, 24(sp) ; LP64E-WITHFP-NEXT: call va5_aligned_stack_callee ; LP64E-WITHFP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; LP64E-WITHFP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -2994,8 +2994,26 @@ define i32 @va_large_stack(ptr %fmt, ...) 
{ ; LP64-LP64F-LP64D-FPELIM-NEXT: .cfi_def_cfa_offset 100000080 ; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 ; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a5, 312(a0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a6, 320(a0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a7, 328(a0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a1, 280(a0) ; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a2, 288(a0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 296(a0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: sd a4, 304(a0) +; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414 ; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a0, a0, 284 ; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a0, 8(sp) @@ -3003,24 +3021,6 @@ define i32 @va_large_stack(ptr %fmt, ...) { ; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0 ; LP64-LP64F-LP64D-FPELIM-NEXT: lw a0, 280(a0) ; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, sp, a1 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a5, 312(a1) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, sp, a1 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a6, 320(a1) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, sp, a1 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a7, 328(a1) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, sp, a1 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a2, 288(a1) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, sp, a1 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 296(a1) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, sp, a1 -; LP64-LP64F-LP64D-FPELIM-NEXT: sd a4, 304(a1) -; LP64-LP64F-LP64D-FPELIM-NEXT: lui a1, 24414 ; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a1, a1, 336 ; LP64-LP64F-LP64D-FPELIM-NEXT: add sp, sp, a1 ; LP64-LP64F-LP64D-FPELIM-NEXT: .cfi_def_cfa_offset 0 @@ -3039,18 +3039,18 @@ define i32 @va_large_stack(ptr %fmt, ...) 
{ ; LP64-LP64F-LP64D-WITHFP-NEXT: lui a0, 24414 ; LP64-LP64F-LP64D-WITHFP-NEXT: addiw a0, a0, -1680 ; LP64-LP64F-LP64D-WITHFP-NEXT: sub sp, sp, a0 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a1, 8(s0) -; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, s0, 12 -; LP64-LP64F-LP64D-WITHFP-NEXT: lui a1, 24414 -; LP64-LP64F-LP64D-WITHFP-NEXT: sub a1, s0, a1 -; LP64-LP64F-LP64D-WITHFP-NEXT: sd a0, -288(a1) -; LP64-LP64F-LP64D-WITHFP-NEXT: lw a0, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a5, 40(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a6, 48(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a7, 56(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a1, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a2, 16(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, 24(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a4, 32(s0) +; LP64-LP64F-LP64D-WITHFP-NEXT: addi a0, s0, 12 +; LP64-LP64F-LP64D-WITHFP-NEXT: lui a1, 24414 +; LP64-LP64F-LP64D-WITHFP-NEXT: sub a1, s0, a1 +; LP64-LP64F-LP64D-WITHFP-NEXT: sd a0, -288(a1) +; LP64-LP64F-LP64D-WITHFP-NEXT: lw a0, 8(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: lui a1, 24414 ; LP64-LP64F-LP64D-WITHFP-NEXT: addiw a1, a1, -1680 ; LP64-LP64F-LP64D-WITHFP-NEXT: add sp, sp, a1 @@ -3070,28 +3070,28 @@ define i32 @va_large_stack(ptr %fmt, ...) { ; LP64E-FPELIM-NEXT: sub sp, sp, a0 ; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 100000064 ; LP64E-FPELIM-NEXT: lui a0, 24414 -; LP64E-FPELIM-NEXT: addiw a0, a0, 284 ; LP64E-FPELIM-NEXT: add a0, sp, a0 -; LP64E-FPELIM-NEXT: sd a0, 8(sp) +; LP64E-FPELIM-NEXT: sd a5, 312(a0) ; LP64E-FPELIM-NEXT: lui a0, 24414 ; LP64E-FPELIM-NEXT: add a0, sp, a0 ; LP64E-FPELIM-NEXT: sd a1, 280(a0) ; LP64E-FPELIM-NEXT: lui a0, 24414 ; LP64E-FPELIM-NEXT: add a0, sp, a0 +; LP64E-FPELIM-NEXT: sd a2, 288(a0) +; LP64E-FPELIM-NEXT: lui a0, 24414 +; LP64E-FPELIM-NEXT: add a0, sp, a0 +; LP64E-FPELIM-NEXT: sd a3, 296(a0) +; LP64E-FPELIM-NEXT: lui a0, 24414 +; LP64E-FPELIM-NEXT: add a0, sp, a0 +; LP64E-FPELIM-NEXT: sd a4, 304(a0) +; LP64E-FPELIM-NEXT: lui a0, 24414 +; LP64E-FPELIM-NEXT: addiw a0, a0, 284 +; LP64E-FPELIM-NEXT: add a0, sp, a0 +; LP64E-FPELIM-NEXT: sd a0, 8(sp) +; LP64E-FPELIM-NEXT: lui a0, 24414 +; LP64E-FPELIM-NEXT: add a0, sp, a0 ; LP64E-FPELIM-NEXT: lw a0, 280(a0) ; LP64E-FPELIM-NEXT: lui a1, 24414 -; LP64E-FPELIM-NEXT: add a1, sp, a1 -; LP64E-FPELIM-NEXT: sd a5, 312(a1) -; LP64E-FPELIM-NEXT: lui a1, 24414 -; LP64E-FPELIM-NEXT: add a1, sp, a1 -; LP64E-FPELIM-NEXT: sd a2, 288(a1) -; LP64E-FPELIM-NEXT: lui a1, 24414 -; LP64E-FPELIM-NEXT: add a1, sp, a1 -; LP64E-FPELIM-NEXT: sd a3, 296(a1) -; LP64E-FPELIM-NEXT: lui a1, 24414 -; LP64E-FPELIM-NEXT: add a1, sp, a1 -; LP64E-FPELIM-NEXT: sd a4, 304(a1) -; LP64E-FPELIM-NEXT: lui a1, 24414 ; LP64E-FPELIM-NEXT: addiw a1, a1, 320 ; LP64E-FPELIM-NEXT: add sp, sp, a1 ; LP64E-FPELIM-NEXT: .cfi_def_cfa_offset 0 @@ -3110,16 +3110,16 @@ define i32 @va_large_stack(ptr %fmt, ...) 
{ ; LP64E-WITHFP-NEXT: lui a0, 24414 ; LP64E-WITHFP-NEXT: addiw a0, a0, -1704 ; LP64E-WITHFP-NEXT: sub sp, sp, a0 -; LP64E-WITHFP-NEXT: addi a0, s0, 12 -; LP64E-WITHFP-NEXT: lui a6, 24414 -; LP64E-WITHFP-NEXT: sub a6, s0, a6 -; LP64E-WITHFP-NEXT: sd a0, -288(a6) -; LP64E-WITHFP-NEXT: sd a1, 8(s0) -; LP64E-WITHFP-NEXT: lw a0, 8(s0) ; LP64E-WITHFP-NEXT: sd a5, 40(s0) +; LP64E-WITHFP-NEXT: sd a1, 8(s0) ; LP64E-WITHFP-NEXT: sd a2, 16(s0) ; LP64E-WITHFP-NEXT: sd a3, 24(s0) ; LP64E-WITHFP-NEXT: sd a4, 32(s0) +; LP64E-WITHFP-NEXT: addi a0, s0, 12 +; LP64E-WITHFP-NEXT: lui a1, 24414 +; LP64E-WITHFP-NEXT: sub a1, s0, a1 +; LP64E-WITHFP-NEXT: sd a0, -288(a1) +; LP64E-WITHFP-NEXT: lw a0, 8(s0) ; LP64E-WITHFP-NEXT: lui a1, 24414 ; LP64E-WITHFP-NEXT: addiw a1, a1, -1704 ; LP64E-WITHFP-NEXT: add sp, sp, a1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 437b7e557718c..13beb844dec36 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -9,9 +9,9 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 @@ -29,26 +29,26 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -73,9 +73,9 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 @@ -93,26 +93,26 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; 
RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -137,9 +137,9 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 ; RV64I-NEXT: slli a1, a1, 3 @@ -157,26 +157,26 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 @@ -224,20 +224,20 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t1, 1(a1) -; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 
; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -263,40 +263,40 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a5, 7(a0) -; RV32I-NEXT: lbu a6, 4(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a3, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a3, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 6(a0) +; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a5, a4, a3 -; RV32I-NEXT: or a4, a1, a6 -; RV32I-NEXT: slli a4, a4, 3 -; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a5, a3 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a4, a1, 3 ; RV32I-NEXT: srl a1, a5, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_2: -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) +; RV32I-NEXT: lbu a6, 0(a0) +; RV32I-NEXT: lbu a7, 1(a0) ; RV32I-NEXT: lbu t0, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: slli a5, a5, 1 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 @@ -360,20 +360,20 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t1, 1(a1) -; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -399,40 +399,40 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a6, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a3, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli 
a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a5, a4, a3 -; RV32I-NEXT: or a4, a1, a6 -; RV32I-NEXT: slli a4, a4, 3 -; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a5, a3 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a4, a1, 3 ; RV32I-NEXT: sll a1, a5, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB4_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB4_3 ; RV32I-NEXT: .LBB4_2: -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) ; RV32I-NEXT: lbu t0, 6(a0) ; RV32I-NEXT: lbu a0, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: srli a5, a5, 1 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 @@ -496,20 +496,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t1, 1(a1) -; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: lbu t2, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -535,42 +535,41 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a3, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) ; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a6, 7(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 2(a1) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, a4 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, t0 ; RV32I-NEXT: slli a4, a5, 16 ; RV32I-NEXT: slli a5, a6, 24 ; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: or a4, a4, a3 ; RV32I-NEXT: or a3, a1, a7 ; RV32I-NEXT: slli a3, a3, 3 -; RV32I-NEXT: addi a6, a3, -32 ; RV32I-NEXT: sra a1, a4, a3 +; RV32I-NEXT: addi a6, a3, -32 ; RV32I-NEXT: bltz a6, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a5, a5, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: srai a1, a5, 31 ; RV32I-NEXT: j .LBB5_3 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: lbu a5, 1(a0) -; RV32I-NEXT: lbu a6, 0(a0) +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a6, 1(a0) ; RV32I-NEXT: lbu a7, 2(a0) ; 
RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a4, a4, 1 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a0, a0, 24 @@ -633,54 +632,54 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t2, 1(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a6, a5, 35 ; RV64I-NEXT: or a5, a4, a3 ; RV64I-NEXT: or a4, a6, a1 -; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: srl a1, a5, a4 +; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: bltz a3, .LBB6_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: -; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: lbu t1, 0(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, a6, t1 -; RV64I-NEXT: lbu t1, 4(a0) -; RV64I-NEXT: lbu t2, 5(a0) -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t2 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: srl a0, a0, a4 @@ -787,10 +786,10 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: andi a1, a1, 12 ; RV32I-NEXT: add a1, t2, a1 ; RV32I-NEXT: andi a3, a0, 24 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a4, 0(a1) ; RV32I-NEXT: lw a5, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: srl a7, a5, a0 ; RV32I-NEXT: slli t0, a6, 1 @@ -872,54 +871,54 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t2, 1(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: 
or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 5 ; RV64I-NEXT: slli a6, a5, 37 ; RV64I-NEXT: or a5, a4, a3 ; RV64I-NEXT: or a4, a6, a1 -; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: srl a1, a5, a4 +; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: bltz a3, .LBB7_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: -; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: lbu t1, 0(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, a6, t1 -; RV64I-NEXT: lbu t1, 4(a0) -; RV64I-NEXT: lbu t2, 5(a0) -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t2 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: srl a0, a0, a4 @@ -1016,38 +1015,38 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a0, 12(sp) -; RV32I-NEXT: lw a0, 8(a1) +; RV32I-NEXT: lw a0, 0(a1) ; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srli a5, a0, 16 -; RV32I-NEXT: srli a6, a0, 24 -; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: srli a6, a4, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: srli t0, a1, 16 ; RV32I-NEXT: srli t1, a1, 24 ; RV32I-NEXT: srli t2, a1, 8 -; RV32I-NEXT: srli t3, a4, 16 -; RV32I-NEXT: srli t4, a4, 24 -; RV32I-NEXT: srli t5, a4, 8 +; RV32I-NEXT: srli t3, a0, 16 +; RV32I-NEXT: srli t4, a0, 24 +; RV32I-NEXT: srli t5, a0, 8 ; RV32I-NEXT: srli t6, a3, 16 -; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a4, 8(a2) ; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb a5, 10(a2) ; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: srli a4, a3, 24 ; RV32I-NEXT: sb a1, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a1, a3, 8 -; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: sb t5, 1(a2) ; RV32I-NEXT: sb t3, 2(a2) ; RV32I-NEXT: sb t4, 3(a2) ; RV32I-NEXT: sb a3, 4(a2) ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb t6, 6(a2) -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a4, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1087,54 +1086,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: 
lbu t0, 0(a1) ; RV64I-NEXT: lbu t2, 1(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a6, a5, 35 ; RV64I-NEXT: or a5, a4, a3 ; RV64I-NEXT: or a4, a6, a1 -; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: sll a1, a5, a4 +; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: bltz a3, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 10(a0) -; RV64I-NEXT: lbu t0, 11(a0) -; RV64I-NEXT: lbu t1, 8(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, a6, t1 -; RV64I-NEXT: lbu t1, 12(a0) -; RV64I-NEXT: lbu t2, 13(a0) -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: lbu a6, 8(a0) +; RV64I-NEXT: lbu a7, 9(a0) +; RV64I-NEXT: lbu t0, 10(a0) +; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: lbu t0, 12(a0) +; RV64I-NEXT: lbu t1, 13(a0) +; RV64I-NEXT: lbu t2, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t2 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: sll a0, a0, a4 @@ -1241,11 +1240,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: andi a1, a1, 12 ; RV32I-NEXT: sub a1, t2, a1 ; RV32I-NEXT: andi a3, a0, 24 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a4, 0(a1) ; RV32I-NEXT: lw a5, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: sll a7, a5, a0 ; RV32I-NEXT: srli t0, a4, 1 ; RV32I-NEXT: sll a1, a1, a0 @@ -1326,54 +1325,54 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t2, 1(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a5, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 5 ; RV64I-NEXT: slli a6, a5, 37 ; RV64I-NEXT: or a5, a4, a3 ; RV64I-NEXT: or a4, a6, a1 -; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: sll a1, a5, a4 +; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: bltz a3, .LBB9_2 ; RV64I-NEXT: # %bb.1: 
; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB9_3 ; RV64I-NEXT: .LBB9_2: -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 10(a0) -; RV64I-NEXT: lbu t0, 11(a0) -; RV64I-NEXT: lbu t1, 8(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, a6, t1 -; RV64I-NEXT: lbu t1, 12(a0) -; RV64I-NEXT: lbu t2, 13(a0) -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: lbu a6, 8(a0) +; RV64I-NEXT: lbu a7, 9(a0) +; RV64I-NEXT: lbu t0, 10(a0) +; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: lbu t0, 12(a0) +; RV64I-NEXT: lbu t1, 13(a0) +; RV64I-NEXT: lbu t2, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t2 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: sll a0, a0, a4 @@ -1470,38 +1469,38 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sw a4, 20(sp) ; RV32I-NEXT: sw a5, 24(sp) ; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: lw a0, 8(a1) +; RV32I-NEXT: lw a0, 0(a1) ; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srli a5, a0, 16 -; RV32I-NEXT: srli a6, a0, 24 -; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: srli a6, a4, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: srli t0, a1, 16 ; RV32I-NEXT: srli t1, a1, 24 ; RV32I-NEXT: srli t2, a1, 8 -; RV32I-NEXT: srli t3, a4, 16 -; RV32I-NEXT: srli t4, a4, 24 -; RV32I-NEXT: srli t5, a4, 8 +; RV32I-NEXT: srli t3, a0, 16 +; RV32I-NEXT: srli t4, a0, 24 +; RV32I-NEXT: srli t5, a0, 8 ; RV32I-NEXT: srli t6, a3, 16 -; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a4, 8(a2) ; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb a5, 10(a2) ; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: srli a4, a3, 24 ; RV32I-NEXT: sb a1, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a1, a3, 8 -; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: sb t5, 1(a2) ; RV32I-NEXT: sb t3, 2(a2) ; RV32I-NEXT: sb t4, 3(a2) ; RV32I-NEXT: sb a3, 4(a2) ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb t6, 6(a2) -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a4, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -1542,56 +1541,55 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t2, 1(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, t1, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or 
a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a6, a5, 32 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a7, a4, 35 ; RV64I-NEXT: or a4, a6, a3 ; RV64I-NEXT: or a3, a7, a1 -; RV64I-NEXT: addi a6, a3, -64 ; RV64I-NEXT: sra a1, a4, a3 +; RV64I-NEXT: addi a6, a3, -64 ; RV64I-NEXT: bltz a6, .LBB10_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sraiw a3, a5, 31 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: sraiw a1, a5, 31 ; RV64I-NEXT: j .LBB10_3 ; RV64I-NEXT: .LBB10_2: -; RV64I-NEXT: lbu a5, 1(a0) -; RV64I-NEXT: lbu a6, 2(a0) -; RV64I-NEXT: lbu a7, 3(a0) -; RV64I-NEXT: lbu t0, 0(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a5, t0 -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a6, 1(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a0, a0, a3 @@ -1665,17 +1663,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, t1, 8 ; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: or t3, t5, t4 -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t4, 1(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t1, t4, t1 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t1 -; RV32I-NEXT: mv t1, sp +; RV32I-NEXT: or a1, a1, t5 +; RV32I-NEXT: mv t4, sp ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or t2, a0, t2 @@ -1684,7 +1682,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a5, a7, a6 ; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: or a6, t2, t0 -; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a0, 16(sp) ; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) @@ -1695,12 +1693,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw a6, 12(sp) ; RV32I-NEXT: slli a0, a1, 3 ; RV32I-NEXT: andi a1, a1, 12 -; RV32I-NEXT: add a1, t1, a1 +; RV32I-NEXT: add a1, t4, a1 ; RV32I-NEXT: andi a3, a0, 24 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a4, 0(a1) ; RV32I-NEXT: lw a5, 4(a1) ; RV32I-NEXT: lw a6, 8(a1) -; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a1, 12(a1) ; RV32I-NEXT: srl a7, a5, a0 ; RV32I-NEXT: slli t0, a6, 1 @@ -1782,56 +1780,55 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; 
RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 0(a1) ; RV64I-NEXT: lbu t2, 1(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 2(a1) ; RV64I-NEXT: lbu a1, 3(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, t1, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a6, a5, 32 ; RV64I-NEXT: slli a1, a1, 5 ; RV64I-NEXT: slli a7, a4, 37 ; RV64I-NEXT: or a4, a6, a3 ; RV64I-NEXT: or a3, a7, a1 -; RV64I-NEXT: addi a6, a3, -64 ; RV64I-NEXT: sra a1, a4, a3 +; RV64I-NEXT: addi a6, a3, -64 ; RV64I-NEXT: bltz a6, .LBB11_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sraiw a3, a5, 31 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: sraiw a1, a5, 31 ; RV64I-NEXT: j .LBB11_3 ; RV64I-NEXT: .LBB11_2: -; RV64I-NEXT: lbu a5, 1(a0) -; RV64I-NEXT: lbu a6, 2(a0) -; RV64I-NEXT: lbu a7, 3(a0) -; RV64I-NEXT: lbu t0, 0(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a5, t0 -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a6, 1(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a0, a0, a3 @@ -1927,38 +1924,38 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw a4, 4(sp) ; RV32I-NEXT: sw a5, 8(sp) ; RV32I-NEXT: sw a6, 12(sp) -; RV32I-NEXT: lw a0, 8(a1) +; RV32I-NEXT: lw a0, 0(a1) ; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a4, 0(a1) +; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: srli a5, a0, 16 -; RV32I-NEXT: srli a6, a0, 24 -; RV32I-NEXT: srli a7, a0, 8 +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: srli a6, a4, 24 +; RV32I-NEXT: srli a7, a4, 8 ; RV32I-NEXT: srli t0, a1, 16 ; RV32I-NEXT: srli t1, a1, 24 ; RV32I-NEXT: srli t2, a1, 8 -; RV32I-NEXT: srli t3, a4, 16 -; RV32I-NEXT: srli t4, a4, 24 -; RV32I-NEXT: srli t5, a4, 8 +; RV32I-NEXT: srli t3, a0, 16 +; RV32I-NEXT: srli t4, a0, 24 +; RV32I-NEXT: srli t5, a0, 8 ; RV32I-NEXT: srli t6, a3, 16 -; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb a4, 8(a2) ; RV32I-NEXT: sb a7, 9(a2) ; RV32I-NEXT: sb a5, 10(a2) ; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: srli a4, a3, 24 ; RV32I-NEXT: sb a1, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a1, a3, 8 -; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: sb t5, 1(a2) ; RV32I-NEXT: sb t3, 2(a2) ; 
RV32I-NEXT: sb t4, 3(a2) ; RV32I-NEXT: sb a3, 4(a2) ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb t6, 6(a2) -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a4, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 @@ -2065,13 +2062,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) @@ -2088,8 +2085,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2108,21 +2105,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: add a1, s6, a1 ; RV64I-NEXT: andi a0, a4, 56 +; RV64I-NEXT: xori a5, a0, 63 ; RV64I-NEXT: ld a3, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) -; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld a6, 8(a1) +; RV64I-NEXT: ld a7, 16(a1) ; RV64I-NEXT: ld t0, 24(a1) -; RV64I-NEXT: srl a0, a5, a4 -; RV64I-NEXT: slli t1, a6, 1 +; RV64I-NEXT: srl a0, a6, a4 +; RV64I-NEXT: slli t1, a7, 1 ; RV64I-NEXT: srl a1, a3, a4 -; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: srl a3, a6, a4 -; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: slli a6, a6, 1 +; RV64I-NEXT: srl a3, a7, a4 +; RV64I-NEXT: slli a7, t0, 1 ; RV64I-NEXT: srl t0, t0, a4 -; RV64I-NEXT: sll a4, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 +; RV64I-NEXT: sll a4, t1, a5 +; RV64I-NEXT: sll a6, a6, a5 +; RV64I-NEXT: sll a5, a7, a5 ; RV64I-NEXT: srli a7, t0, 56 ; RV64I-NEXT: srli t1, t0, 48 ; RV64I-NEXT: srli t2, t0, 40 @@ -2131,8 +2128,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli t5, t0, 16 ; RV64I-NEXT: srli t6, t0, 8 ; RV64I-NEXT: or a4, a0, a4 -; RV64I-NEXT: or a5, a1, a5 -; RV64I-NEXT: or a6, a3, a6 +; RV64I-NEXT: or a6, a1, a6 +; RV64I-NEXT: or a5, a3, a5 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t2, 29(a2) ; RV64I-NEXT: sb t1, 30(a2) @@ -2141,20 +2138,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) -; RV64I-NEXT: srli a7, a6, 56 -; RV64I-NEXT: srli t0, a6, 48 -; RV64I-NEXT: srli t1, a6, 40 -; RV64I-NEXT: srli t2, a6, 32 -; RV64I-NEXT: srli t3, a6, 24 -; RV64I-NEXT: srli t4, a6, 16 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: srli t5, a5, 56 -; RV64I-NEXT: srli t6, a5, 48 -; RV64I-NEXT: srli s0, a5, 40 -; RV64I-NEXT: srli s1, a5, 32 -; RV64I-NEXT: srli s2, a5, 24 -; RV64I-NEXT: srli s3, a5, 16 +; RV64I-NEXT: srli a7, a5, 56 +; RV64I-NEXT: srli t0, a5, 48 +; RV64I-NEXT: srli t1, a5, 40 +; RV64I-NEXT: srli t2, a5, 32 +; RV64I-NEXT: srli t3, a5, 24 +; RV64I-NEXT: srli t4, a5, 16 ; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: srli t5, a6, 56 +; RV64I-NEXT: srli t6, a6, 48 +; RV64I-NEXT: srli s0, a6, 40 +; 
RV64I-NEXT: srli s1, a6, 32 +; RV64I-NEXT: srli s2, a6, 24 +; RV64I-NEXT: srli s3, a6, 16 +; RV64I-NEXT: srli a6, a6, 8 ; RV64I-NEXT: srli s4, a4, 56 ; RV64I-NEXT: srli s5, a4, 48 ; RV64I-NEXT: srli s6, a4, 40 @@ -2164,7 +2161,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a7, 23(a2) ; RV64I-NEXT: srli a7, a4, 32 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: sb a5, 17(a2) ; RV64I-NEXT: sb t4, 18(a2) ; RV64I-NEXT: sb t3, 19(a2) ; RV64I-NEXT: srli a3, a4, 24 @@ -2172,10 +2169,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s0, 5(a2) ; RV64I-NEXT: sb t6, 6(a2) ; RV64I-NEXT: sb t5, 7(a2) -; RV64I-NEXT: srli a6, a4, 16 +; RV64I-NEXT: srli a5, a4, 16 ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a6, 1(a2) ; RV64I-NEXT: sb s3, 2(a2) ; RV64I-NEXT: sb s2, 3(a2) ; RV64I-NEXT: sb a7, 12(a2) @@ -2184,7 +2181,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s4, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a6, 10(a2) +; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -2543,13 +2540,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) @@ -2566,8 +2563,8 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2587,24 +2584,24 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: andi a0, a3, 32 ; RV64I-NEXT: add a1, s6, a1 -; RV64I-NEXT: ld a4, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) -; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: xori a4, a0, 63 +; RV64I-NEXT: ld a5, 0(a1) +; RV64I-NEXT: ld a6, 8(a1) +; RV64I-NEXT: ld a7, 16(a1) ; RV64I-NEXT: ld t0, 24(a1) -; RV64I-NEXT: srl a0, a5, a3 -; RV64I-NEXT: slli t1, a6, 1 -; RV64I-NEXT: srl a1, a4, a3 -; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: srl a4, a6, a3 -; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: srl a0, a6, a3 +; RV64I-NEXT: slli t1, a7, 1 +; RV64I-NEXT: srl a1, a5, a3 +; RV64I-NEXT: slli a6, a6, 1 +; RV64I-NEXT: srl a5, a7, a3 +; RV64I-NEXT: slli a7, t0, 1 ; RV64I-NEXT: srl a3, t0, a3 -; RV64I-NEXT: sll t0, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 -; RV64I-NEXT: srli a7, a4, 24 -; RV64I-NEXT: srli t1, a4, 16 -; RV64I-NEXT: srli t2, a4, 8 +; RV64I-NEXT: sll t0, t1, a4 +; RV64I-NEXT: sll a6, a6, a4 +; RV64I-NEXT: sll a4, a7, a4 +; RV64I-NEXT: srli a7, a5, 24 +; RV64I-NEXT: srli t1, a5, 16 +; RV64I-NEXT: srli t2, a5, 8 ; RV64I-NEXT: srli t3, a3, 56 ; 
RV64I-NEXT: srli t4, a3, 48 ; RV64I-NEXT: srli t5, a3, 40 @@ -2616,19 +2613,19 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: srli s4, a1, 16 ; RV64I-NEXT: srli s5, a1, 8 ; RV64I-NEXT: srli s6, a0, 24 -; RV64I-NEXT: or a6, a4, a6 -; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: sb t2, 17(a2) ; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb a7, 19(a2) -; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: srli a5, a0, 16 ; RV64I-NEXT: sb t6, 28(a2) ; RV64I-NEXT: sb t5, 29(a2) ; RV64I-NEXT: sb t4, 30(a2) ; RV64I-NEXT: sb t3, 31(a2) ; RV64I-NEXT: srli a7, a0, 8 ; RV64I-NEXT: or t0, a0, t0 -; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: or a6, a1, a6 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s2, 25(a2) ; RV64I-NEXT: sb s1, 26(a2) @@ -2639,16 +2636,16 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb s3, 3(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a7, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb s6, 11(a2) -; RV64I-NEXT: srli a0, a6, 56 -; RV64I-NEXT: srli a1, a6, 48 -; RV64I-NEXT: srli a3, a6, 40 -; RV64I-NEXT: srli a4, a6, 32 -; RV64I-NEXT: srli a6, a5, 56 -; RV64I-NEXT: srli a7, a5, 48 -; RV64I-NEXT: srli t1, a5, 40 -; RV64I-NEXT: srli a5, a5, 32 +; RV64I-NEXT: srli a0, a4, 56 +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: srli a4, a4, 32 +; RV64I-NEXT: srli a5, a6, 56 +; RV64I-NEXT: srli a7, a6, 48 +; RV64I-NEXT: srli t1, a6, 40 +; RV64I-NEXT: srli a6, a6, 32 ; RV64I-NEXT: srli t2, t0, 56 ; RV64I-NEXT: srli t3, t0, 48 ; RV64I-NEXT: srli t4, t0, 40 @@ -2657,10 +2654,10 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb a3, 21(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a6, 4(a2) ; RV64I-NEXT: sb t1, 5(a2) ; RV64I-NEXT: sb a7, 6(a2) -; RV64I-NEXT: sb a6, 7(a2) +; RV64I-NEXT: sb a5, 7(a2) ; RV64I-NEXT: sb t0, 12(a2) ; RV64I-NEXT: sb t4, 13(a2) ; RV64I-NEXT: sb t3, 14(a2) @@ -2797,13 +2794,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) ; RV32I-NEXT: lw a1, 0(t6) ; RV32I-NEXT: lw a0, 4(t6) ; RV32I-NEXT: lw a4, 8(t6) ; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw a6, 16(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: lw a7, 24(t6) ; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -3001,9 +2998,9 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd a5, 8(sp) ; RV64I-NEXT: sd a3, 16(sp) ; RV64I-NEXT: sd a0, 24(sp) -; RV64I-NEXT: ld a4, 16(t3) -; RV64I-NEXT: ld a0, 8(t3) ; RV64I-NEXT: ld a1, 0(t3) +; RV64I-NEXT: ld a0, 8(t3) +; RV64I-NEXT: ld a4, 16(t3) ; RV64I-NEXT: ld a3, 24(t3) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 @@ -3197,13 +3194,13 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(t6) -; RV32I-NEXT: lw a5, 20(t6) -; RV32I-NEXT: lw a7, 24(t6) ; RV32I-NEXT: lw a1, 0(t6) ; RV32I-NEXT: lw a0, 4(t6) ; RV32I-NEXT: lw a4, 8(t6) ; RV32I-NEXT: lw a3, 12(t6) +; RV32I-NEXT: lw a6, 16(t6) +; RV32I-NEXT: lw a5, 20(t6) +; RV32I-NEXT: 
lw a7, 24(t6) ; RV32I-NEXT: lw t0, 28(t6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -3380,13 +3377,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) @@ -3403,8 +3400,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -3423,11 +3420,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: sub a1, s6, a1 ; RV64I-NEXT: andi a3, a0, 56 +; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: ld a4, 0(a1) ; RV64I-NEXT: ld a5, 8(a1) ; RV64I-NEXT: ld a6, 16(a1) ; RV64I-NEXT: ld a1, 24(a1) -; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: sll a7, a5, a0 ; RV64I-NEXT: srli t0, a4, 1 ; RV64I-NEXT: sll t1, a1, a0 @@ -3858,13 +3855,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) @@ -3881,8 +3878,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -3902,25 +3899,25 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: andi a0, a3, 32 ; RV64I-NEXT: sub a1, s6, a1 -; RV64I-NEXT: ld a4, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) -; RV64I-NEXT: ld a6, 16(a1) +; RV64I-NEXT: xori a4, a0, 63 +; RV64I-NEXT: ld a5, 0(a1) +; RV64I-NEXT: ld a6, 8(a1) +; RV64I-NEXT: ld a7, 16(a1) ; RV64I-NEXT: ld a1, 24(a1) -; RV64I-NEXT: xori a7, a0, 63 -; RV64I-NEXT: sll a0, a5, a3 -; RV64I-NEXT: srli t0, a4, 1 +; RV64I-NEXT: sll a0, a6, a3 +; RV64I-NEXT: srli t0, a5, 1 ; RV64I-NEXT: sll a1, a1, a3 -; RV64I-NEXT: srli t1, a6, 1 -; RV64I-NEXT: sll a6, a6, a3 -; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: sll a3, a4, a3 -; RV64I-NEXT: srl a4, t0, a7 -; RV64I-NEXT: srl t0, t1, a7 -; RV64I-NEXT: srl a5, a5, a7 -; RV64I-NEXT: srli a7, a6, 56 -; RV64I-NEXT: srli t1, a6, 48 -; RV64I-NEXT: srli t2, a6, 40 -; RV64I-NEXT: srli t3, a6, 32 +; RV64I-NEXT: srli t1, a7, 1 +; RV64I-NEXT: sll a7, a7, a3 +; RV64I-NEXT: srli a6, a6, 1 +; RV64I-NEXT: sll a3, a5, a3 +; RV64I-NEXT: srl a5, t0, a4 +; 
RV64I-NEXT: srl t0, t1, a4 +; RV64I-NEXT: srl a4, a6, a4 +; RV64I-NEXT: srli a6, a7, 56 +; RV64I-NEXT: srli t1, a7, 48 +; RV64I-NEXT: srli t2, a7, 40 +; RV64I-NEXT: srli t3, a7, 32 ; RV64I-NEXT: srli t4, a1, 56 ; RV64I-NEXT: srli t5, a1, 48 ; RV64I-NEXT: srli t6, a1, 40 @@ -3933,19 +3930,19 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: srli s6, a3, 16 ; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: srli t0, a3, 8 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: srli a6, a0, 56 +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: srli a7, a0, 56 ; RV64I-NEXT: sb t3, 20(a2) ; RV64I-NEXT: sb t2, 21(a2) ; RV64I-NEXT: sb t1, 22(a2) -; RV64I-NEXT: sb a7, 23(a2) -; RV64I-NEXT: srli a7, a0, 48 +; RV64I-NEXT: sb a6, 23(a2) +; RV64I-NEXT: srli a6, a0, 48 ; RV64I-NEXT: sb s0, 28(a2) ; RV64I-NEXT: sb t6, 29(a2) ; RV64I-NEXT: sb t5, 30(a2) ; RV64I-NEXT: sb t4, 31(a2) ; RV64I-NEXT: srli t1, a0, 40 -; RV64I-NEXT: or a4, a0, a4 +; RV64I-NEXT: or a5, a0, a5 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sb s4, 4(a2) ; RV64I-NEXT: sb s3, 5(a2) @@ -3957,18 +3954,18 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: sb s5, 3(a2) ; RV64I-NEXT: sb a0, 12(a2) ; RV64I-NEXT: sb t1, 13(a2) -; RV64I-NEXT: sb a7, 14(a2) -; RV64I-NEXT: sb a6, 15(a2) -; RV64I-NEXT: srli a0, a5, 24 -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: srli a6, a5, 8 +; RV64I-NEXT: sb a6, 14(a2) +; RV64I-NEXT: sb a7, 15(a2) +; RV64I-NEXT: srli a0, a4, 24 +; RV64I-NEXT: srli a3, a4, 16 +; RV64I-NEXT: srli a6, a4, 8 ; RV64I-NEXT: srli a7, a1, 24 ; RV64I-NEXT: srli t0, a1, 16 ; RV64I-NEXT: srli t1, a1, 8 -; RV64I-NEXT: srli t2, a4, 24 -; RV64I-NEXT: srli t3, a4, 16 -; RV64I-NEXT: srli t4, a4, 8 -; RV64I-NEXT: sb a5, 16(a2) +; RV64I-NEXT: srli t2, a5, 24 +; RV64I-NEXT: srli t3, a5, 16 +; RV64I-NEXT: srli t4, a5, 8 +; RV64I-NEXT: sb a4, 16(a2) ; RV64I-NEXT: sb a6, 17(a2) ; RV64I-NEXT: sb a3, 18(a2) ; RV64I-NEXT: sb a0, 19(a2) @@ -3976,7 +3973,7 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV64I-NEXT: sb t1, 25(a2) ; RV64I-NEXT: sb t0, 26(a2) ; RV64I-NEXT: sb a7, 27(a2) -; RV64I-NEXT: sb a4, 8(a2) +; RV64I-NEXT: sb a5, 8(a2) ; RV64I-NEXT: sb t4, 9(a2) ; RV64I-NEXT: sb t3, 10(a2) ; RV64I-NEXT: sb t2, 11(a2) @@ -4112,13 +4109,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw ; RV32I-NEXT: sw t0, 44(sp) ; RV32I-NEXT: sw t1, 48(sp) ; RV32I-NEXT: sw a5, 52(sp) -; RV32I-NEXT: lw a6, 16(t2) -; RV32I-NEXT: lw a5, 20(t2) -; RV32I-NEXT: lw a7, 24(t2) ; RV32I-NEXT: lw a1, 0(t2) ; RV32I-NEXT: lw a0, 4(t2) ; RV32I-NEXT: lw a4, 8(t2) ; RV32I-NEXT: lw a3, 12(t2) +; RV32I-NEXT: lw a6, 16(t2) +; RV32I-NEXT: lw a5, 20(t2) +; RV32I-NEXT: lw a7, 24(t2) ; RV32I-NEXT: lw t0, 28(t2) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -4316,9 +4313,9 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV64I-NEXT: sd a5, 40(sp) ; RV64I-NEXT: sd a3, 48(sp) ; RV64I-NEXT: sd a0, 56(sp) -; RV64I-NEXT: ld a4, 16(t2) -; RV64I-NEXT: ld a0, 8(t2) ; RV64I-NEXT: ld a1, 0(t2) +; RV64I-NEXT: ld a0, 8(t2) +; RV64I-NEXT: ld a4, 16(t2) ; RV64I-NEXT: ld a3, 24(t2) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 @@ -4512,13 +4509,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; RV32I-NEXT: sw t0, 44(sp) ; RV32I-NEXT: sw t1, 48(sp) ; RV32I-NEXT: sw a5, 52(sp) -; RV32I-NEXT: lw a6, 16(t2) -; RV32I-NEXT: lw a5, 20(t2) -; RV32I-NEXT: lw a7, 24(t2) ; 
RV32I-NEXT: lw a1, 0(t2) ; RV32I-NEXT: lw a0, 4(t2) ; RV32I-NEXT: lw a4, 8(t2) ; RV32I-NEXT: lw a3, 12(t2) +; RV32I-NEXT: lw a6, 16(t2) +; RV32I-NEXT: lw a5, 20(t2) +; RV32I-NEXT: lw a7, 24(t2) ; RV32I-NEXT: lw t0, 28(t2) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -4695,13 +4692,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -4714,8 +4711,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -4739,21 +4736,21 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: add a1, s6, a1 ; RV64I-NEXT: andi a0, a4, 56 +; RV64I-NEXT: xori a5, a0, 63 ; RV64I-NEXT: ld a3, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) -; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld a6, 8(a1) +; RV64I-NEXT: ld a7, 16(a1) ; RV64I-NEXT: ld t0, 24(a1) -; RV64I-NEXT: srl a0, a5, a4 -; RV64I-NEXT: slli t1, a6, 1 +; RV64I-NEXT: srl a0, a6, a4 +; RV64I-NEXT: slli t1, a7, 1 ; RV64I-NEXT: srl a1, a3, a4 -; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: srl a3, a6, a4 -; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: slli a6, a6, 1 +; RV64I-NEXT: srl a3, a7, a4 +; RV64I-NEXT: slli a7, t0, 1 ; RV64I-NEXT: sra t0, t0, a4 -; RV64I-NEXT: sll a4, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 +; RV64I-NEXT: sll a4, t1, a5 +; RV64I-NEXT: sll a6, a6, a5 +; RV64I-NEXT: sll a5, a7, a5 ; RV64I-NEXT: srli a7, t0, 56 ; RV64I-NEXT: srli t1, t0, 48 ; RV64I-NEXT: srli t2, t0, 40 @@ -4762,8 +4759,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli t5, t0, 16 ; RV64I-NEXT: srli t6, t0, 8 ; RV64I-NEXT: or a4, a0, a4 -; RV64I-NEXT: or a5, a1, a5 -; RV64I-NEXT: or a6, a3, a6 +; RV64I-NEXT: or a6, a1, a6 +; RV64I-NEXT: or a5, a3, a5 ; RV64I-NEXT: sb t3, 28(a2) ; RV64I-NEXT: sb t2, 29(a2) ; RV64I-NEXT: sb t1, 30(a2) @@ -4772,20 +4769,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) -; RV64I-NEXT: srli a7, a6, 56 -; RV64I-NEXT: srli t0, a6, 48 -; RV64I-NEXT: srli t1, a6, 40 -; RV64I-NEXT: srli t2, a6, 32 -; RV64I-NEXT: srli t3, a6, 24 -; RV64I-NEXT: srli t4, a6, 16 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: srli t5, a5, 56 -; RV64I-NEXT: srli t6, a5, 48 -; RV64I-NEXT: srli s0, a5, 40 -; RV64I-NEXT: srli s1, a5, 32 -; RV64I-NEXT: srli s2, a5, 24 -; RV64I-NEXT: srli s3, a5, 16 +; RV64I-NEXT: srli a7, a5, 56 +; RV64I-NEXT: srli t0, a5, 48 +; RV64I-NEXT: srli t1, a5, 40 +; RV64I-NEXT: srli t2, a5, 32 +; RV64I-NEXT: srli t3, a5, 24 +; RV64I-NEXT: srli t4, a5, 16 ; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: srli t5, a6, 56 +; RV64I-NEXT: srli t6, a6, 48 +; 
RV64I-NEXT: srli s0, a6, 40 +; RV64I-NEXT: srli s1, a6, 32 +; RV64I-NEXT: srli s2, a6, 24 +; RV64I-NEXT: srli s3, a6, 16 +; RV64I-NEXT: srli a6, a6, 8 ; RV64I-NEXT: srli s4, a4, 56 ; RV64I-NEXT: srli s5, a4, 48 ; RV64I-NEXT: srli s6, a4, 40 @@ -4795,7 +4792,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a7, 23(a2) ; RV64I-NEXT: srli a7, a4, 32 ; RV64I-NEXT: sb a3, 16(a2) -; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: sb a5, 17(a2) ; RV64I-NEXT: sb t4, 18(a2) ; RV64I-NEXT: sb t3, 19(a2) ; RV64I-NEXT: srli a3, a4, 24 @@ -4803,10 +4800,10 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s0, 5(a2) ; RV64I-NEXT: sb t6, 6(a2) ; RV64I-NEXT: sb t5, 7(a2) -; RV64I-NEXT: srli a6, a4, 16 +; RV64I-NEXT: srli a5, a4, 16 ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a1, 0(a2) -; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: sb a6, 1(a2) ; RV64I-NEXT: sb s3, 2(a2) ; RV64I-NEXT: sb s2, 3(a2) ; RV64I-NEXT: sb a7, 12(a2) @@ -4815,7 +4812,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb s4, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: sb a6, 10(a2) +; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) ; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload @@ -5175,13 +5172,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -5194,8 +5191,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -5220,24 +5217,24 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: andi a1, a1, 24 ; RV64I-NEXT: andi a0, a3, 32 ; RV64I-NEXT: add a1, s6, a1 -; RV64I-NEXT: ld a4, 0(a1) -; RV64I-NEXT: ld a5, 8(a1) -; RV64I-NEXT: ld a6, 16(a1) -; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: xori a4, a0, 63 +; RV64I-NEXT: ld a5, 0(a1) +; RV64I-NEXT: ld a6, 8(a1) +; RV64I-NEXT: ld a7, 16(a1) ; RV64I-NEXT: ld t0, 24(a1) -; RV64I-NEXT: srl a0, a5, a3 -; RV64I-NEXT: slli t1, a6, 1 -; RV64I-NEXT: srl a1, a4, a3 -; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: srl a4, a6, a3 -; RV64I-NEXT: slli a6, t0, 1 +; RV64I-NEXT: srl a0, a6, a3 +; RV64I-NEXT: slli t1, a7, 1 +; RV64I-NEXT: srl a1, a5, a3 +; RV64I-NEXT: slli a6, a6, 1 +; RV64I-NEXT: srl a5, a7, a3 +; RV64I-NEXT: slli a7, t0, 1 ; RV64I-NEXT: sra a3, t0, a3 -; RV64I-NEXT: sll t0, t1, a7 -; RV64I-NEXT: sll a5, a5, a7 -; RV64I-NEXT: sll a6, a6, a7 -; RV64I-NEXT: srli a7, a4, 24 -; RV64I-NEXT: srli t1, a4, 16 -; RV64I-NEXT: srli t2, a4, 8 +; RV64I-NEXT: sll t0, t1, a4 +; RV64I-NEXT: sll a6, a6, a4 +; RV64I-NEXT: sll a4, a7, a4 +; RV64I-NEXT: srli a7, a5, 24 +; RV64I-NEXT: srli t1, a5, 16 +; RV64I-NEXT: srli t2, a5, 8 ; 
RV64I-NEXT: srli t3, a3, 56 ; RV64I-NEXT: srli t4, a3, 48 ; RV64I-NEXT: srli t5, a3, 40 @@ -5249,19 +5246,19 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: srli s4, a1, 16 ; RV64I-NEXT: srli s5, a1, 8 ; RV64I-NEXT: srli s6, a0, 24 -; RV64I-NEXT: or a6, a4, a6 -; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: sb t2, 17(a2) ; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb a7, 19(a2) -; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: srli a5, a0, 16 ; RV64I-NEXT: sb t6, 28(a2) ; RV64I-NEXT: sb t5, 29(a2) ; RV64I-NEXT: sb t4, 30(a2) ; RV64I-NEXT: sb t3, 31(a2) ; RV64I-NEXT: srli a7, a0, 8 ; RV64I-NEXT: or t0, a0, t0 -; RV64I-NEXT: or a5, a1, a5 +; RV64I-NEXT: or a6, a1, a6 ; RV64I-NEXT: sb a3, 24(a2) ; RV64I-NEXT: sb s2, 25(a2) ; RV64I-NEXT: sb s1, 26(a2) @@ -5272,16 +5269,16 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb s3, 3(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: sb a7, 9(a2) -; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb s6, 11(a2) -; RV64I-NEXT: srli a0, a6, 56 -; RV64I-NEXT: srli a1, a6, 48 -; RV64I-NEXT: srli a3, a6, 40 -; RV64I-NEXT: srli a4, a6, 32 -; RV64I-NEXT: srli a6, a5, 56 -; RV64I-NEXT: srli a7, a5, 48 -; RV64I-NEXT: srli t1, a5, 40 -; RV64I-NEXT: srli a5, a5, 32 +; RV64I-NEXT: srli a0, a4, 56 +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: srli a4, a4, 32 +; RV64I-NEXT: srli a5, a6, 56 +; RV64I-NEXT: srli a7, a6, 48 +; RV64I-NEXT: srli t1, a6, 40 +; RV64I-NEXT: srli a6, a6, 32 ; RV64I-NEXT: srli t2, t0, 56 ; RV64I-NEXT: srli t3, t0, 48 ; RV64I-NEXT: srli t4, t0, 40 @@ -5290,10 +5287,10 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV64I-NEXT: sb a3, 21(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: sb a6, 4(a2) ; RV64I-NEXT: sb t1, 5(a2) ; RV64I-NEXT: sb a7, 6(a2) -; RV64I-NEXT: sb a6, 7(a2) +; RV64I-NEXT: sb a5, 7(a2) ; RV64I-NEXT: sb t0, 12(a2) ; RV64I-NEXT: sb t4, 13(a2) ; RV64I-NEXT: sb t3, 14(a2) @@ -5431,13 +5428,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(s6) -; RV32I-NEXT: lw a5, 20(s6) -; RV32I-NEXT: lw a7, 24(s6) ; RV32I-NEXT: lw a1, 0(s6) ; RV32I-NEXT: lw a0, 4(s6) ; RV32I-NEXT: lw a4, 8(s6) ; RV32I-NEXT: lw a3, 12(s6) +; RV32I-NEXT: lw a6, 16(s6) +; RV32I-NEXT: lw a5, 20(s6) +; RV32I-NEXT: lw a7, 24(s6) ; RV32I-NEXT: lw t0, 28(s6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 @@ -5636,9 +5633,9 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV64I-NEXT: sd a7, 8(sp) ; RV64I-NEXT: sd a3, 16(sp) ; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: ld a4, 16(t5) -; RV64I-NEXT: ld a0, 8(t5) ; RV64I-NEXT: ld a1, 0(t5) +; RV64I-NEXT: ld a0, 8(t5) +; RV64I-NEXT: ld a4, 16(t5) ; RV64I-NEXT: ld a3, 24(t5) ; RV64I-NEXT: srli a5, a4, 56 ; RV64I-NEXT: srli a6, a4, 48 @@ -5833,13 +5830,13 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; RV32I-NEXT: sw t0, 12(sp) ; RV32I-NEXT: sw t1, 16(sp) ; RV32I-NEXT: sw a5, 20(sp) -; RV32I-NEXT: lw a6, 16(s6) -; RV32I-NEXT: lw a5, 20(s6) -; RV32I-NEXT: lw a7, 24(s6) ; RV32I-NEXT: lw a1, 0(s6) ; RV32I-NEXT: lw a0, 4(s6) ; RV32I-NEXT: lw a4, 8(s6) ; RV32I-NEXT: lw a3, 12(s6) +; RV32I-NEXT: lw a6, 16(s6) +; 
RV32I-NEXT: lw a5, 20(s6) +; RV32I-NEXT: lw a7, 24(s6) ; RV32I-NEXT: lw t0, 28(s6) ; RV32I-NEXT: srli t1, a7, 24 ; RV32I-NEXT: srli t2, a7, 16 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index b2c130c2d7c10..f02ffa8951ad7 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -5,12 +5,12 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 @@ -28,26 +28,26 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -66,12 +66,12 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 @@ -89,26 +89,26 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; 
RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -127,12 +127,12 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_4bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) ; RV64I-NEXT: lb a0, 3(a0) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a0, a0, 24 @@ -150,26 +150,26 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_4bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a0, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu a6, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a0, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: srli a3, a0, 24 @@ -215,20 +215,20 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 4(a1) ; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -253,39 +253,39 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr 
%dst) nounwind { ; ; RV32I-LABEL: lshr_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 6(a0) -; RV32I-NEXT: lbu a5, 7(a0) -; RV32I-NEXT: lbu a6, 4(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a3, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a3, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 6(a0) +; RV32I-NEXT: lbu a6, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a5, a4, a3 -; RV32I-NEXT: or a4, a1, a6 -; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a5, a3 +; RV32I-NEXT: or a4, a1, a4 ; RV32I-NEXT: srl a1, a5, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB3_3 ; RV32I-NEXT: .LBB3_2: -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) +; RV32I-NEXT: lbu a6, 0(a0) +; RV32I-NEXT: lbu a7, 1(a0) ; RV32I-NEXT: lbu t0, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, t0 @@ -348,20 +348,20 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 4(a1) ; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -386,39 +386,39 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: lbu a4, 2(a0) -; RV32I-NEXT: lbu a5, 3(a0) -; RV32I-NEXT: lbu a6, 0(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: slli a4, a4, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a3, a6 -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 1(a1) -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a3, 0(a0) +; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu 
a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: or a5, a4, a3 -; RV32I-NEXT: or a4, a1, a6 -; RV32I-NEXT: addi a3, a4, -32 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a5, a3 +; RV32I-NEXT: or a4, a1, a4 ; RV32I-NEXT: sll a1, a5, a4 +; RV32I-NEXT: addi a3, a4, -32 ; RV32I-NEXT: bltz a3, .LBB4_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: j .LBB4_3 ; RV32I-NEXT: .LBB4_2: -; RV32I-NEXT: lbu a6, 5(a0) -; RV32I-NEXT: lbu a7, 4(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) ; RV32I-NEXT: lbu t0, 6(a0) ; RV32I-NEXT: lbu a0, 7(a0) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, t0 @@ -481,20 +481,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: or a7, t2, t0 +; RV64I-NEXT: lbu t0, 4(a1) ; RV64I-NEXT: lbu t1, 5(a1) -; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: lbu t2, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: or t0, t1, t0 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: or a4, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a4, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a0, a0, a3 @@ -519,41 +519,40 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_8bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: lbu a4, 4(a0) +; RV32I-NEXT: lbu a3, 4(a0) +; RV32I-NEXT: lbu a4, 5(a0) ; RV32I-NEXT: lbu a5, 6(a0) ; RV32I-NEXT: lbu a6, 7(a0) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: lbu a7, 0(a1) -; RV32I-NEXT: lbu t0, 1(a1) -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: lbu a4, 2(a1) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 0(a1) +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, a4 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, t0 ; RV32I-NEXT: slli a4, a5, 16 ; RV32I-NEXT: slli a5, a6, 24 ; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: or a4, a4, a3 ; RV32I-NEXT: or a3, a1, a7 -; RV32I-NEXT: addi a6, a3, -32 ; RV32I-NEXT: sra a1, a4, a3 +; RV32I-NEXT: addi a6, a3, -32 ; RV32I-NEXT: bltz a6, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a5, a5, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: srai a1, a5, 31 ; RV32I-NEXT: j .LBB5_3 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: lbu a5, 1(a0) -; RV32I-NEXT: lbu a6, 0(a0) +; RV32I-NEXT: lbu a5, 0(a0) +; RV32I-NEXT: lbu a6, 1(a0) ; RV32I-NEXT: lbu a7, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a7 @@ -615,53 +614,53 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { 
; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 4(a1) ; RV64I-NEXT: lbu t2, 5(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a4, a3 ; RV64I-NEXT: or a4, a1, a6 -; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: srl a1, a5, a4 +; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: bltz a3, .LBB6_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB6_3 ; RV64I-NEXT: .LBB6_2: -; RV64I-NEXT: lbu a6, 1(a0) -; RV64I-NEXT: lbu a7, 2(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: lbu t1, 0(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, a6, t1 -; RV64I-NEXT: lbu t1, 4(a0) -; RV64I-NEXT: lbu t2, 5(a0) -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 1(a0) +; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 5(a0) +; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t2 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: slli a5, a5, 1 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: srl a0, a0, a4 @@ -740,20 +739,20 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or t3, t4, t3 ; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t1, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) ; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t2, 2(a1) +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) +; RV32I-NEXT: lbu t4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t1, t4, t1 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: or a1, a1, t4 ; RV32I-NEXT: mv t2, sp ; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, t0, a7 @@ -767,11 +766,11 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a1, 3 ; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: add a0, t2, a0 ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a6, 8(a0) -; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a0, 12(a0) ; RV32I-NEXT: srl a7, a5, a1 ; RV32I-NEXT: slli t0, a6, 1 @@ -851,53 +850,53 @@ define 
void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 4(a1) ; RV64I-NEXT: lbu t2, 5(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, t1, a5 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a5, a4, a3 ; RV64I-NEXT: or a4, a1, a6 -; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: sll a1, a5, a4 +; RV64I-NEXT: addi a3, a4, -64 ; RV64I-NEXT: bltz a3, .LBB7_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: j .LBB7_3 ; RV64I-NEXT: .LBB7_2: -; RV64I-NEXT: lbu a6, 9(a0) -; RV64I-NEXT: lbu a7, 10(a0) -; RV64I-NEXT: lbu t0, 11(a0) -; RV64I-NEXT: lbu t1, 8(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, a6, t1 -; RV64I-NEXT: lbu t1, 12(a0) -; RV64I-NEXT: lbu t2, 13(a0) -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 14(a0) -; RV64I-NEXT: lbu a0, 15(a0) -; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or t1, t2, t1 +; RV64I-NEXT: lbu a6, 8(a0) +; RV64I-NEXT: lbu a7, 9(a0) +; RV64I-NEXT: lbu t0, 10(a0) +; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: lbu t0, 12(a0) +; RV64I-NEXT: lbu t1, 13(a0) +; RV64I-NEXT: lbu t2, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli t1, t1, 8 +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, t2 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: not a7, a4 ; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: sll a0, a0, a4 @@ -976,20 +975,20 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or t3, t4, t3 ; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: lbu t1, 0(a1) -; RV32I-NEXT: lbu t4, 1(a1) ; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t2, 2(a1) +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t2, 1(a1) +; RV32I-NEXT: lbu t4, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t1, t4, t1 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: or a1, a1, t4 ; RV32I-NEXT: addi t2, sp, 16 ; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: or a4, t0, a7 @@ -1003,12 +1002,12 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a1, 3 ; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: sub a0, t2, a0 ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a6, 8(a0) ; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: xori a3, a3, 31 ; 
RV32I-NEXT: sll a7, a5, a1 ; RV32I-NEXT: srli t0, a4, 1 ; RV32I-NEXT: sll a0, a0, a1 @@ -1087,55 +1086,54 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli t3, t3, 24 ; RV64I-NEXT: or t1, t2, t1 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: or a7, t3, t0 +; RV64I-NEXT: lbu t0, 4(a1) ; RV64I-NEXT: lbu t2, 5(a1) -; RV64I-NEXT: or t0, t3, t0 ; RV64I-NEXT: lbu t3, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t2, t2, 8 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or t0, t2, t0 ; RV64I-NEXT: slli t3, t3, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a5, t1, a5 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: slli a4, a5, 32 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a4, a4, a3 ; RV64I-NEXT: or a3, a1, a6 -; RV64I-NEXT: addi a6, a3, -64 ; RV64I-NEXT: sra a1, a4, a3 +; RV64I-NEXT: addi a6, a3, -64 ; RV64I-NEXT: bltz a6, .LBB8_2 ; RV64I-NEXT: # %bb.1: -; RV64I-NEXT: sraiw a3, a5, 31 ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: sraiw a1, a5, 31 ; RV64I-NEXT: j .LBB8_3 ; RV64I-NEXT: .LBB8_2: -; RV64I-NEXT: lbu a5, 1(a0) -; RV64I-NEXT: lbu a6, 2(a0) -; RV64I-NEXT: lbu a7, 3(a0) -; RV64I-NEXT: lbu t0, 0(a0) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a5, t0 -; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: lbu t1, 5(a0) -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu a0, 7(a0) -; RV64I-NEXT: slli t1, t1, 8 -; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a6, 1(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 5(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a0, a0, a3 @@ -1209,26 +1207,26 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, t1, 8 ; RV32I-NEXT: or a4, t3, a4 ; RV32I-NEXT: or t3, t5, t4 -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 1(a1) ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu t1, 0(a1) +; RV32I-NEXT: lbu t4, 1(a1) +; RV32I-NEXT: lbu t5, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t1, t4, t1 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: or a1, a1, t5 ; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: mv a5, sp ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t1, a0, t2 +; RV32I-NEXT: or t2, a0, t2 ; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: or a6, a7, a6 ; RV32I-NEXT: or a4, t3, a4 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a7, t2, t0 +; RV32I-NEXT: or a1, a1, t1 ; RV32I-NEXT: sw a0, 16(sp) 
; RV32I-NEXT: sw a0, 20(sp) ; RV32I-NEXT: sw a0, 24(sp) @@ -1240,11 +1238,11 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a1, 3 ; RV32I-NEXT: andi a3, a1, 31 ; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: add a0, a5, a0 ; RV32I-NEXT: lw a4, 0(a0) ; RV32I-NEXT: lw a5, 4(a0) ; RV32I-NEXT: lw a6, 8(a0) -; RV32I-NEXT: xori a3, a3, 31 ; RV32I-NEXT: lw a0, 12(a0) ; RV32I-NEXT: srl a7, a5, a1 ; RV32I-NEXT: slli t0, a6, 1 @@ -1392,13 +1390,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -1415,8 +1413,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t1, s0, t6 ; RV64I-NEXT: or t2, s5, s1 -; RV64I-NEXT: or t3, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t3, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a3, a3, 32 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -1434,11 +1432,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli a0, a1, 3 ; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: add a0, a6, a0 ; RV64I-NEXT: ld a4, 0(a0) ; RV64I-NEXT: ld a5, 8(a0) ; RV64I-NEXT: ld a6, 16(a0) -; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: ld a0, 24(a0) ; RV64I-NEXT: srl a7, a5, a1 ; RV64I-NEXT: slli t0, a6, 1 @@ -1868,13 +1866,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -1891,8 +1889,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t1, s0, t6 ; RV64I-NEXT: or t2, s5, s1 -; RV64I-NEXT: or t3, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t3, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a3, a3, 32 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -1910,12 +1908,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli a0, a1, 3 ; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: sub a0, a6, a0 ; RV64I-NEXT: ld a4, 0(a0) ; RV64I-NEXT: ld a5, 8(a0) ; RV64I-NEXT: ld a6, 16(a0) ; RV64I-NEXT: ld a0, 24(a0) -; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: sll a7, a5, a1 ; RV64I-NEXT: srli t0, a4, 1 ; RV64I-NEXT: sll t1, a0, a1 @@ -2344,13 +2342,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli s7, s7, 24 ; RV64I-NEXT: or s5, s6, s5 ; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: lbu s3, 4(a1) 
+; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 4(a1) ; RV64I-NEXT: lbu s6, 5(a1) -; RV64I-NEXT: or s4, s7, s4 ; RV64I-NEXT: lbu s7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli s6, s6, 8 -; RV64I-NEXT: or s3, s6, s3 +; RV64I-NEXT: or s4, s6, s4 ; RV64I-NEXT: slli s7, s7, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, s7 @@ -2363,8 +2361,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a0, a0, t5 ; RV64I-NEXT: or t0, s0, t6 ; RV64I-NEXT: or t1, s5, s1 -; RV64I-NEXT: or t2, s4, s2 -; RV64I-NEXT: or a1, a1, s3 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: slli a0, a0, 32 @@ -2387,11 +2385,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli a0, a1, 3 ; RV64I-NEXT: andi a3, a1, 63 ; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: add a0, s6, a0 ; RV64I-NEXT: ld a4, 0(a0) ; RV64I-NEXT: ld a5, 8(a0) ; RV64I-NEXT: ld a6, 16(a0) -; RV64I-NEXT: xori a3, a3, 63 ; RV64I-NEXT: ld a0, 24(a0) ; RV64I-NEXT: srl a7, a5, a1 ; RV64I-NEXT: slli t0, a6, 1 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index a30593d7d7afb..a496699f7e386 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1713,8 +1713,8 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) { ; RV64-NEXT: mulhu a0, a0, a1 ; RV64-NEXT: srli a1, a0, 32 ; RV64-NEXT: snez a1, a1 -; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: sw a1, 0(a2) +; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo3.i32: @@ -1733,8 +1733,8 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) { ; RV64ZBA-NEXT: mul a3, a0, a1 ; RV64ZBA-NEXT: srli a3, a3, 32 ; RV64ZBA-NEXT: snez a3, a3 -; RV64ZBA-NEXT: mulw a0, a0, a1 ; RV64ZBA-NEXT: sw a3, 0(a2) +; RV64ZBA-NEXT: mulw a0, a0, a1 ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo3.i32: @@ -1753,8 +1753,8 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) { ; RV64ZICOND-NEXT: mulhu a0, a0, a1 ; RV64ZICOND-NEXT: srli a1, a0, 32 ; RV64ZICOND-NEXT: snez a1, a1 -; RV64ZICOND-NEXT: sext.w a0, a0 ; RV64ZICOND-NEXT: sw a1, 0(a2) +; RV64ZICOND-NEXT: sext.w a0, a0 ; RV64ZICOND-NEXT: ret %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1) %5 = extractvalue { i32, i1 } %4, 1 diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index e761fcb736a87..f6b7f97f6525c 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -292,12 +292,12 @@ define ptr @lwuib(ptr %base, i64 %a, ptr %addr.1) { define ptr @ldia(ptr %base, ptr %addr.2, i64 %a) { ; RV32XTHEADMEMIDX-LABEL: ldia: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: lw a4, 4(a0) -; RV32XTHEADMEMIDX-NEXT: lw a5, 0(a0) +; RV32XTHEADMEMIDX-NEXT: lw a4, 0(a0) +; RV32XTHEADMEMIDX-NEXT: lw a5, 4(a0) ; RV32XTHEADMEMIDX-NEXT: addi a0, a0, -128 -; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 -; RV32XTHEADMEMIDX-NEXT: add a2, a5, a2 -; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a5 +; RV32XTHEADMEMIDX-NEXT: add a3, a5, a3 +; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2 +; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4 ; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4 ; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1) ; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1) @@ -859,9 +859,9 @@ define i64 @lrd(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-LABEL: lrd: ; RV32XTHEADMEMIDX: # %bb.0: ; 
RV32XTHEADMEMIDX-NEXT: slli a2, a1, 3 +; RV32XTHEADMEMIDX-NEXT: add a2, a0, a2 +; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a2) ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a0, a0, a2 -; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a0) ; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 ; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 ; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 @@ -883,8 +883,8 @@ define i64 @lrd_2(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-LABEL: lrd_2: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: addi a2, a0, 96 -; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a2, a1, 3 ; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 100 +; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a2, a1, 3 ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 ; RV32XTHEADMEMIDX-NEXT: add a0, a2, a2 ; RV32XTHEADMEMIDX-NEXT: sltu a2, a0, a2 @@ -909,9 +909,9 @@ define i64 @lurd(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurd: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: slli a2, a1, 3 +; RV32XTHEADMEMIDX-NEXT: add a2, a0, a2 +; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a2) ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a0, a0, a2 -; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a0) ; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 ; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 ; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 diff --git a/llvm/test/CodeGen/RISCV/xtheadmempair.ll b/llvm/test/CodeGen/RISCV/xtheadmempair.ll index 3525c40026064..7c940a3966217 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmempair.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmempair.ll @@ -57,14 +57,14 @@ define i64 @lwud(ptr %a) { define i64 @ldd(ptr %a) { ; RV32XTHEADMEMPAIR-LABEL: ldd: ; RV32XTHEADMEMPAIR: # %bb.0: -; RV32XTHEADMEMPAIR-NEXT: lw a1, 44(a0) -; RV32XTHEADMEMPAIR-NEXT: lw a2, 32(a0) -; RV32XTHEADMEMPAIR-NEXT: lw a3, 36(a0) -; RV32XTHEADMEMPAIR-NEXT: lw a0, 40(a0) -; RV32XTHEADMEMPAIR-NEXT: add a1, a3, a1 -; RV32XTHEADMEMPAIR-NEXT: add a0, a2, a0 -; RV32XTHEADMEMPAIR-NEXT: sltu a2, a0, a2 -; RV32XTHEADMEMPAIR-NEXT: add a1, a1, a2 +; RV32XTHEADMEMPAIR-NEXT: lw a1, 32(a0) +; RV32XTHEADMEMPAIR-NEXT: lw a2, 36(a0) +; RV32XTHEADMEMPAIR-NEXT: lw a3, 40(a0) +; RV32XTHEADMEMPAIR-NEXT: lw a0, 44(a0) +; RV32XTHEADMEMPAIR-NEXT: add a2, a2, a0 +; RV32XTHEADMEMPAIR-NEXT: add a0, a1, a3 +; RV32XTHEADMEMPAIR-NEXT: sltu a1, a0, a1 +; RV32XTHEADMEMPAIR-NEXT: add a1, a2, a1 ; RV32XTHEADMEMPAIR-NEXT: ret ; ; RV64XTHEADMEMPAIR-LABEL: ldd: @@ -245,10 +245,10 @@ define i64 @ld64(ptr %a) { define i128 @ld128(ptr %a) { ; RV32XTHEADMEMPAIR-LABEL: ld128: ; RV32XTHEADMEMPAIR: # %bb.0: -; RV32XTHEADMEMPAIR-NEXT: th.lwd a2, a3, (a1), 1, 3 -; RV32XTHEADMEMPAIR-NEXT: th.lwd a4, a5, (a1), 0, 3 -; RV32XTHEADMEMPAIR-NEXT: th.swd a2, a3, (a0), 1, 3 -; RV32XTHEADMEMPAIR-NEXT: th.swd a4, a5, (a0), 0, 3 +; RV32XTHEADMEMPAIR-NEXT: th.lwd a2, a3, (a1), 0, 3 +; RV32XTHEADMEMPAIR-NEXT: th.lwd a4, a5, (a1), 1, 3 +; RV32XTHEADMEMPAIR-NEXT: th.swd a4, a5, (a0), 1, 3 +; RV32XTHEADMEMPAIR-NEXT: th.swd a2, a3, (a0), 0, 3 ; RV32XTHEADMEMPAIR-NEXT: ret ; ; RV64XTHEADMEMPAIR-LABEL: ld128: @@ -279,10 +279,10 @@ define void @sd64(ptr %a, i64 %b) { define void @sd128(ptr %a, i128 %b) { ; RV32XTHEADMEMPAIR-LABEL: sd128: ; RV32XTHEADMEMPAIR: # %bb.0: -; RV32XTHEADMEMPAIR-NEXT: th.lwd a2, a3, (a1), 1, 3 -; RV32XTHEADMEMPAIR-NEXT: th.lwd a4, a5, (a1), 0, 3 -; RV32XTHEADMEMPAIR-NEXT: th.swd a2, a3, (a0), 1, 3 -; RV32XTHEADMEMPAIR-NEXT: th.swd a4, a5, (a0), 0, 3 +; RV32XTHEADMEMPAIR-NEXT: th.lwd a2, a3, (a1), 0, 3 +; RV32XTHEADMEMPAIR-NEXT: th.lwd a4, a5, (a1), 1, 3 +; RV32XTHEADMEMPAIR-NEXT: th.swd a4, a5, (a0), 1, 3 +; 
RV32XTHEADMEMPAIR-NEXT: th.swd a2, a3, (a0), 0, 3 ; RV32XTHEADMEMPAIR-NEXT: ret ; ; RV64XTHEADMEMPAIR-LABEL: sd128: diff --git a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll index d953d34e2d7b9..1c2eb5ecafbc4 100644 --- a/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll +++ b/llvm/test/CodeGen/RISCV/zbb-logic-neg-imm.ll @@ -137,13 +137,13 @@ define void @orarray100(ptr %a) { ; RV32-NEXT: addi a1, a1, 1 ; RV32-NEXT: add a4, a0, a4 ; RV32-NEXT: lw a5, 0(a4) -; RV32-NEXT: seqz a6, a1 -; RV32-NEXT: add a2, a2, a6 -; RV32-NEXT: xori a6, a1, 100 ; RV32-NEXT: orn a5, a5, a3 -; RV32-NEXT: or a6, a6, a2 ; RV32-NEXT: sw a5, 0(a4) -; RV32-NEXT: bnez a6, .LBB8_1 +; RV32-NEXT: seqz a4, a1 +; RV32-NEXT: xori a5, a1, 100 +; RV32-NEXT: add a2, a2, a4 +; RV32-NEXT: or a5, a5, a2 +; RV32-NEXT: bnez a5, .LBB8_1 ; RV32-NEXT: # %bb.2: # %for.cond.cleanup ; RV32-NEXT: ret ; @@ -180,16 +180,16 @@ for.body: define void @orarray3(ptr %a) { ; CHECK-LABEL: orarray3: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a0) -; CHECK-NEXT: lw a2, 4(a0) -; CHECK-NEXT: lw a3, 8(a0) -; CHECK-NEXT: lui a4, 1048560 -; CHECK-NEXT: orn a1, a1, a4 -; CHECK-NEXT: orn a2, a2, a4 -; CHECK-NEXT: orn a3, a3, a4 -; CHECK-NEXT: sw a1, 0(a0) -; CHECK-NEXT: sw a2, 4(a0) -; CHECK-NEXT: sw a3, 8(a0) +; CHECK-NEXT: lui a1, 1048560 +; CHECK-NEXT: lw a2, 0(a0) +; CHECK-NEXT: lw a3, 4(a0) +; CHECK-NEXT: lw a4, 8(a0) +; CHECK-NEXT: orn a2, a2, a1 +; CHECK-NEXT: orn a3, a3, a1 +; CHECK-NEXT: orn a1, a4, a1 +; CHECK-NEXT: sw a2, 0(a0) +; CHECK-NEXT: sw a3, 4(a0) +; CHECK-NEXT: sw a1, 8(a0) ; CHECK-NEXT: ret %1 = load i32, ptr %a, align 4 %or = or i32 %1, 65535 diff --git a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll index b7d7d4c0945b6..d9f6e1a5820c8 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll @@ -10,11 +10,11 @@ define dso_local void @zdinx_asm(ptr nocapture noundef writeonly %a, double noun ; CHECK-LABEL: zdinx_asm: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mv a5, a4 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: #APP -; CHECK-NEXT: fsgnjx.d a2, a6, a4 +; CHECK-NEXT: fsgnjx.d a2, a2, a4 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: sw a2, 8(a0) ; CHECK-NEXT: sw a3, 12(a0) @@ -30,11 +30,11 @@ define dso_local void @zdinx_asm_R(ptr nocapture noundef writeonly %a, double no ; CHECK-LABEL: zdinx_asm_R: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mv a5, a4 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: #APP -; CHECK-NEXT: fsgnjx.d a2, a6, a4 +; CHECK-NEXT: fsgnjx.d a2, a2, a4 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: sw a2, 8(a0) ; CHECK-NEXT: sw a3, 12(a0) @@ -133,21 +133,15 @@ entry: define dso_local void @zdinx_asm_cr(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind { ; CHECK-LABEL: zdinx_asm_cr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-NEXT: mv a5, a4 -; CHECK-NEXT: mv s1, a2 ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: #APP -; CHECK-NEXT: fsgnjx.d a2, s0, a4 +; CHECK-NEXT: fsgnjx.d a2, a2, a4 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: sw a2, 8(a0) ; CHECK-NEXT: sw a3, 12(a0) -; CHECK-NEXT: lw s0, 12(sp) # 4-byte 
Folded Reload -; CHECK-NEXT: lw s1, 8(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret entry: %arrayidx = getelementptr inbounds double, ptr %a, i32 1 @@ -189,21 +183,15 @@ entry: define dso_local void @zdinx_asm_cR(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind { ; CHECK-LABEL: zdinx_asm_cR: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; CHECK-NEXT: mv a5, a4 -; CHECK-NEXT: mv s1, a2 ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: #APP -; CHECK-NEXT: fsgnjx.d a2, s0, a4 +; CHECK-NEXT: fsgnjx.d a2, a2, a4 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: sw a2, 8(a0) ; CHECK-NEXT: sw a3, 12(a0) -; CHECK-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; CHECK-NEXT: lw s1, 8(sp) # 4-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret entry: %arrayidx = getelementptr inbounds double, ptr %a, i32 1 diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index 9a312d9daca8d..05af53bf8a2b4 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -39,9 +39,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo2: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: addi a0, a0, 2047 ; RV32ZDINX-NEXT: mv a2, a1 ; RV32ZDINX-NEXT: fadd.d a2, a2, a2 +; RV32ZDINX-NEXT: addi a0, a0, 2047 ; RV32ZDINX-NEXT: sw a2, -3(a0) ; RV32ZDINX-NEXT: sw a3, 1(a0) ; RV32ZDINX-NEXT: ret @@ -49,9 +49,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind { ; RV32ZDINXUALIGNED-LABEL: foo2: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 ; RV32ZDINXUALIGNED-NEXT: mv a2, a1 ; RV32ZDINXUALIGNED-NEXT: fadd.d a2, a2, a2 +; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 ; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) ; RV32ZDINXUALIGNED-NEXT: sw a3, 1(a0) ; RV32ZDINXUALIGNED-NEXT: ret @@ -108,36 +108,36 @@ define void @foo4(ptr %p) nounwind { ; RV32ZDINX-LABEL: foo4: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: addi a1, a0, 2047 -; RV32ZDINX-NEXT: lw a2, -3(a1) -; RV32ZDINX-NEXT: lw a3, 1(a1) ; RV32ZDINX-NEXT: sw a0, 8(sp) -; RV32ZDINX-NEXT: lui a0, %hi(d) -; RV32ZDINX-NEXT: sw a2, %lo(d)(a0) -; RV32ZDINX-NEXT: sw a3, %lo(d+4)(a0) +; RV32ZDINX-NEXT: addi a0, a0, 2047 +; RV32ZDINX-NEXT: lw a1, 1(a0) +; RV32ZDINX-NEXT: lw a0, -3(a0) +; RV32ZDINX-NEXT: lui a2, %hi(d) +; RV32ZDINX-NEXT: sw a0, %lo(d)(a2) +; RV32ZDINX-NEXT: sw a1, %lo(d+4)(a2) ; RV32ZDINX-NEXT: addi sp, sp, 16 ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo4: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, -16 -; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: lw a2, -3(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 1(a1) ; RV32ZDINXUALIGNED-NEXT: sw a0, 8(sp) -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(d) -; RV32ZDINXUALIGNED-NEXT: sw a2, %lo(d)(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, %lo(d+4)(a0) +; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 +; RV32ZDINXUALIGNED-NEXT: lw a1, 1(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, -3(a0) +; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(d) +; RV32ZDINXUALIGNED-NEXT: sw a0, %lo(d)(a2) +; RV32ZDINXUALIGNED-NEXT: sw a1, %lo(d+4)(a2) ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, 16 ; 
RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo4: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: addi sp, sp, -16 -; RV64ZDINX-NEXT: ld a1, 2044(a0) ; RV64ZDINX-NEXT: sd a0, 8(sp) -; RV64ZDINX-NEXT: lui a0, %hi(d) -; RV64ZDINX-NEXT: sd a1, %lo(d)(a0) +; RV64ZDINX-NEXT: ld a0, 2044(a0) +; RV64ZDINX-NEXT: lui a1, %hi(d) +; RV64ZDINX-NEXT: sd a0, %lo(d)(a1) ; RV64ZDINX-NEXT: addi sp, sp, 16 ; RV64ZDINX-NEXT: ret entry: @@ -184,10 +184,10 @@ define void @foo6(ptr %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo6: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: mv a3, a2 -; RV32ZDINX-NEXT: lui a2, %hi(.LCPI5_0) -; RV32ZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a2) -; RV32ZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) ; RV32ZDINX-NEXT: mv a2, a1 +; RV32ZDINX-NEXT: lui a1, %hi(.LCPI5_0) +; RV32ZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a1) +; RV32ZDINX-NEXT: lw a5, %lo(.LCPI5_0+4)(a1) ; RV32ZDINX-NEXT: fadd.d a2, a2, a4 ; RV32ZDINX-NEXT: addi a0, a0, 2047 ; RV32ZDINX-NEXT: sw a2, -3(a0) @@ -197,10 +197,10 @@ define void @foo6(ptr %p, double %d) nounwind { ; RV32ZDINXUALIGNED-LABEL: foo6: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: mv a3, a2 -; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(.LCPI5_0) -; RV32ZDINXUALIGNED-NEXT: lw a4, %lo(.LCPI5_0)(a2) -; RV32ZDINXUALIGNED-NEXT: lw a5, %lo(.LCPI5_0+4)(a2) ; RV32ZDINXUALIGNED-NEXT: mv a2, a1 +; RV32ZDINXUALIGNED-NEXT: lui a1, %hi(.LCPI5_0) +; RV32ZDINXUALIGNED-NEXT: lw a4, %lo(.LCPI5_0)(a1) +; RV32ZDINXUALIGNED-NEXT: lw a5, %lo(.LCPI5_0+4)(a1) ; RV32ZDINXUALIGNED-NEXT: fadd.d a2, a2, a4 ; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 ; RV32ZDINXUALIGNED-NEXT: sw a2, -3(a0) @@ -226,10 +226,10 @@ define void @foo7(ptr nocapture %p) nounwind { ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 ; RV32ZDINX-NEXT: lui a1, %hi(d) -; RV32ZDINX-NEXT: lw a2, %lo(d+4)(a1) -; RV32ZDINX-NEXT: addi a1, a1, %lo(d) -; RV32ZDINX-NEXT: sw a2, 8(sp) -; RV32ZDINX-NEXT: lw a1, 8(a1) +; RV32ZDINX-NEXT: addi a2, a1, %lo(d) +; RV32ZDINX-NEXT: lw a1, %lo(d+4)(a1) +; RV32ZDINX-NEXT: sw a1, 8(sp) +; RV32ZDINX-NEXT: lw a1, 8(a2) ; RV32ZDINX-NEXT: sw a1, 12(sp) ; RV32ZDINX-NEXT: lw a2, 8(sp) ; RV32ZDINX-NEXT: lw a3, 12(sp) @@ -254,8 +254,8 @@ define void @foo7(ptr nocapture %p) nounwind { ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a1, %hi(d) ; RV64ZDINX-NEXT: addi a2, a1, %lo(d) -; RV64ZDINX-NEXT: lwu a2, 8(a2) ; RV64ZDINX-NEXT: lwu a1, %lo(d+4)(a1) +; RV64ZDINX-NEXT: lwu a2, 8(a2) ; RV64ZDINX-NEXT: slli a2, a2, 32 ; RV64ZDINX-NEXT: or a1, a2, a1 ; RV64ZDINX-NEXT: sd a1, 2044(a0) @@ -272,45 +272,45 @@ define void @foo8(ptr %p) nounwind { ; RV32ZDINX-LABEL: foo8: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: addi a1, a0, 2047 -; RV32ZDINX-NEXT: lw a2, -3(a1) -; RV32ZDINX-NEXT: lw a3, 1(a1) ; RV32ZDINX-NEXT: sw a0, 8(sp) -; RV32ZDINX-NEXT: sw a2, 0(sp) -; RV32ZDINX-NEXT: sw a3, 4(sp) +; RV32ZDINX-NEXT: addi a0, a0, 2047 +; RV32ZDINX-NEXT: lw a1, 1(a0) +; RV32ZDINX-NEXT: lw a0, -3(a0) +; RV32ZDINX-NEXT: lui a2, %hi(d) +; RV32ZDINX-NEXT: addi a3, a2, %lo(d) +; RV32ZDINX-NEXT: sw a0, 0(sp) +; RV32ZDINX-NEXT: sw a1, 4(sp) ; RV32ZDINX-NEXT: lw a0, 4(sp) -; RV32ZDINX-NEXT: lui a1, %hi(d) -; RV32ZDINX-NEXT: addi a2, a1, %lo(d) -; RV32ZDINX-NEXT: sw a0, 8(a2) +; RV32ZDINX-NEXT: sw a0, 8(a3) ; RV32ZDINX-NEXT: lw a0, 0(sp) -; RV32ZDINX-NEXT: sw a0, %lo(d+4)(a1) +; RV32ZDINX-NEXT: sw a0, %lo(d+4)(a2) ; RV32ZDINX-NEXT: addi sp, sp, 16 ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo8: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; 
RV32ZDINXUALIGNED-NEXT: addi sp, sp, -16 -; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: lw a2, -3(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 1(a1) ; RV32ZDINXUALIGNED-NEXT: sw a0, 8(sp) -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(d) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(d) -; RV32ZDINXUALIGNED-NEXT: sw a2, 4(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 8(a0) +; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 +; RV32ZDINXUALIGNED-NEXT: lw a1, 1(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, -3(a0) +; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(d) +; RV32ZDINXUALIGNED-NEXT: addi a2, a2, %lo(d) +; RV32ZDINXUALIGNED-NEXT: sw a0, 4(a2) +; RV32ZDINXUALIGNED-NEXT: sw a1, 8(a2) ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, 16 ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo8: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: addi sp, sp, -16 -; RV64ZDINX-NEXT: ld a1, 2044(a0) ; RV64ZDINX-NEXT: sd a0, 8(sp) -; RV64ZDINX-NEXT: lui a0, %hi(d) -; RV64ZDINX-NEXT: addi a2, a0, %lo(d) -; RV64ZDINX-NEXT: sw a1, %lo(d+4)(a0) -; RV64ZDINX-NEXT: srli a1, a1, 32 -; RV64ZDINX-NEXT: sw a1, 8(a2) +; RV64ZDINX-NEXT: ld a0, 2044(a0) +; RV64ZDINX-NEXT: lui a1, %hi(d) +; RV64ZDINX-NEXT: addi a2, a1, %lo(d) +; RV64ZDINX-NEXT: sw a0, %lo(d+4)(a1) +; RV64ZDINX-NEXT: srli a0, a0, 32 +; RV64ZDINX-NEXT: sw a0, 8(a2) ; RV64ZDINX-NEXT: addi sp, sp, 16 ; RV64ZDINX-NEXT: ret entry: @@ -358,11 +358,11 @@ define void @foo9(ptr nocapture %p) nounwind { ; RV64ZDINX-LABEL: foo9: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a1, %hi(e) -; RV64ZDINX-NEXT: addi a2, a1, %lo(e) -; RV64ZDINX-NEXT: lwu a2, 4(a2) -; RV64ZDINX-NEXT: lwu a1, %lo(e)(a1) -; RV64ZDINX-NEXT: slli a2, a2, 32 -; RV64ZDINX-NEXT: or a1, a2, a1 +; RV64ZDINX-NEXT: lwu a2, %lo(e)(a1) +; RV64ZDINX-NEXT: addi a1, a1, %lo(e) +; RV64ZDINX-NEXT: lwu a1, 4(a1) +; RV64ZDINX-NEXT: slli a1, a1, 32 +; RV64ZDINX-NEXT: or a1, a1, a2 ; RV64ZDINX-NEXT: sd a1, 2044(a0) ; RV64ZDINX-NEXT: ret entry: @@ -380,41 +380,41 @@ define void @foo10(ptr %p) nounwind { ; RV32ZDINX-NEXT: lw a2, -3(a1) ; RV32ZDINX-NEXT: lw a3, 1(a1) ; RV32ZDINX-NEXT: sw a0, 8(sp) +; RV32ZDINX-NEXT: lui a0, %hi(e) ; RV32ZDINX-NEXT: sw a2, 0(sp) ; RV32ZDINX-NEXT: sw a3, 4(sp) -; RV32ZDINX-NEXT: lw a0, 4(sp) -; RV32ZDINX-NEXT: lui a1, %hi(e) -; RV32ZDINX-NEXT: addi a2, a1, %lo(e) -; RV32ZDINX-NEXT: sw a0, 4(a2) -; RV32ZDINX-NEXT: lw a0, 0(sp) -; RV32ZDINX-NEXT: sw a0, %lo(e)(a1) +; RV32ZDINX-NEXT: addi a1, a0, %lo(e) +; RV32ZDINX-NEXT: lw a2, 4(sp) +; RV32ZDINX-NEXT: sw a2, 4(a1) +; RV32ZDINX-NEXT: lw a1, 0(sp) +; RV32ZDINX-NEXT: sw a1, %lo(e)(a0) ; RV32ZDINX-NEXT: addi sp, sp, 16 ; RV32ZDINX-NEXT: ret ; ; RV32ZDINXUALIGNED-LABEL: foo10: ; RV32ZDINXUALIGNED: # %bb.0: # %entry ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, -16 -; RV32ZDINXUALIGNED-NEXT: addi a1, a0, 2047 -; RV32ZDINXUALIGNED-NEXT: lw a2, -3(a1) -; RV32ZDINXUALIGNED-NEXT: lw a3, 1(a1) ; RV32ZDINXUALIGNED-NEXT: sw a0, 8(sp) -; RV32ZDINXUALIGNED-NEXT: lui a0, %hi(e) -; RV32ZDINXUALIGNED-NEXT: addi a0, a0, %lo(e) -; RV32ZDINXUALIGNED-NEXT: sw a2, 0(a0) -; RV32ZDINXUALIGNED-NEXT: sw a3, 4(a0) +; RV32ZDINXUALIGNED-NEXT: addi a0, a0, 2047 +; RV32ZDINXUALIGNED-NEXT: lw a1, 1(a0) +; RV32ZDINXUALIGNED-NEXT: lw a0, -3(a0) +; RV32ZDINXUALIGNED-NEXT: lui a2, %hi(e) +; RV32ZDINXUALIGNED-NEXT: addi a2, a2, %lo(e) +; RV32ZDINXUALIGNED-NEXT: sw a0, 0(a2) +; RV32ZDINXUALIGNED-NEXT: sw a1, 4(a2) ; RV32ZDINXUALIGNED-NEXT: addi sp, sp, 16 ; RV32ZDINXUALIGNED-NEXT: ret ; ; RV64ZDINX-LABEL: foo10: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: addi sp, sp, -16 -; RV64ZDINX-NEXT: ld 
a1, 2044(a0) ; RV64ZDINX-NEXT: sd a0, 8(sp) -; RV64ZDINX-NEXT: lui a0, %hi(e) -; RV64ZDINX-NEXT: sw a1, %lo(e)(a0) -; RV64ZDINX-NEXT: addi a0, a0, %lo(e) -; RV64ZDINX-NEXT: srli a1, a1, 32 -; RV64ZDINX-NEXT: sw a1, 4(a0) +; RV64ZDINX-NEXT: ld a0, 2044(a0) +; RV64ZDINX-NEXT: lui a1, %hi(e) +; RV64ZDINX-NEXT: sw a0, %lo(e)(a1) +; RV64ZDINX-NEXT: addi a1, a1, %lo(e) +; RV64ZDINX-NEXT: srli a0, a0, 32 +; RV64ZDINX-NEXT: sw a0, 4(a1) ; RV64ZDINX-NEXT: addi sp, sp, 16 ; RV64ZDINX-NEXT: ret entry: @@ -521,10 +521,10 @@ define double @foo13(ptr nocapture %p) nounwind { ; RV64ZDINX-LABEL: foo13: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a0, %hi(f) -; RV64ZDINX-NEXT: lwu a1, %lo(f+8)(a0) -; RV64ZDINX-NEXT: lwu a0, %lo(f+4)(a0) -; RV64ZDINX-NEXT: slli a1, a1, 32 -; RV64ZDINX-NEXT: or a0, a1, a0 +; RV64ZDINX-NEXT: lwu a1, %lo(f+4)(a0) +; RV64ZDINX-NEXT: lwu a0, %lo(f+8)(a0) +; RV64ZDINX-NEXT: slli a0, a0, 32 +; RV64ZDINX-NEXT: or a0, a0, a1 ; RV64ZDINX-NEXT: ret entry: %add.ptr = getelementptr inbounds i8, ptr @f, i64 4 diff --git a/llvm/test/CodeGen/Thumb2/bf16-pcs.ll b/llvm/test/CodeGen/Thumb2/bf16-pcs.ll new file mode 100644 index 0000000000000..2ffe420a20520 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/bf16-pcs.ll @@ -0,0 +1,388 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 | FileCheck %s --check-prefix=LE +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 | FileCheck %s --check-prefix=BE +; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+bf16 | FileCheck %s --check-prefix=LE-BF16 +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+bf16 | FileCheck %s --check-prefix=BE-BF16 + +;; Global ISel successfully generates code for some functions for little-endian +;; without +bf16, and falls back to SelectionDAG in all others. 
+; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=LE-GISEL +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=BE +; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+bf16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=LE-BF16 +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+bf16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=BE-BF16 + +define arm_aapcscc bfloat @callee_soft_bfloat_in_reg(bfloat %f) { +; LE-LABEL: callee_soft_bfloat_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: bx lr +; +; BE-LABEL: callee_soft_bfloat_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: bx lr +; +; LE-BF16-LABEL: callee_soft_bfloat_in_reg: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: .pad #4 +; LE-BF16-NEXT: sub sp, #4 +; LE-BF16-NEXT: strh.w r0, [sp, #2] +; LE-BF16-NEXT: ldrh.w r0, [sp, #2] +; LE-BF16-NEXT: add sp, #4 +; LE-BF16-NEXT: bx lr +; +; BE-BF16-LABEL: callee_soft_bfloat_in_reg: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: .pad #4 +; BE-BF16-NEXT: sub sp, #4 +; BE-BF16-NEXT: strh.w r0, [sp, #2] +; BE-BF16-NEXT: ldrh.w r0, [sp, #2] +; BE-BF16-NEXT: add sp, #4 +; BE-BF16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_soft_bfloat_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: bx lr +entry: + ret bfloat %f +} + +define void @caller_soft_bfloat_in_reg() { +; LE-LABEL: caller_soft_bfloat_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: mov.w r0, #16256 +; LE-NEXT: bl callee_soft_bfloat_in_reg +; LE-NEXT: pop {r7, pc} +; +; BE-LABEL: caller_soft_bfloat_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: mov.w r0, #16256 +; BE-NEXT: bl callee_soft_bfloat_in_reg +; BE-NEXT: pop {r7, pc} +; +; LE-BF16-LABEL: caller_soft_bfloat_in_reg: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: .save {r7, lr} +; LE-BF16-NEXT: push {r7, lr} +; LE-BF16-NEXT: mov.w r0, #16256 +; LE-BF16-NEXT: bl callee_soft_bfloat_in_reg +; LE-BF16-NEXT: pop {r7, pc} +; +; BE-BF16-LABEL: caller_soft_bfloat_in_reg: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: .save {r7, lr} +; BE-BF16-NEXT: push {r7, lr} +; BE-BF16-NEXT: mov.w r0, #16256 +; BE-BF16-NEXT: bl callee_soft_bfloat_in_reg +; BE-BF16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_soft_bfloat_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: mov.w r0, #16256 +; LE-GISEL-NEXT: bl callee_soft_bfloat_in_reg +; LE-GISEL-NEXT: pop {r7, pc} +entry: + %ret = call arm_aapcscc bfloat @callee_soft_bfloat_in_reg(bfloat 1.0) + ret void +} + +define arm_aapcscc bfloat @callee_soft_bfloat_on_stack(float %r0, float %r1, float %r2, float %r3, bfloat %f) { +; LE-LABEL: callee_soft_bfloat_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: ldr r0, [sp] +; LE-NEXT: bx lr +; +; BE-LABEL: callee_soft_bfloat_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: ldr r0, [sp] +; BE-NEXT: bx lr +; +; LE-BF16-LABEL: callee_soft_bfloat_on_stack: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: ldrh.w r0, [sp] +; LE-BF16-NEXT: bx lr +; +; BE-BF16-LABEL: callee_soft_bfloat_on_stack: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: ldrh.w r0, [sp, #2] +; BE-BF16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_soft_bfloat_on_stack: +; LE-GISEL: @ 
%bb.0: @ %entry +; LE-GISEL-NEXT: mov r0, sp +; LE-GISEL-NEXT: ldr r0, [r0] +; LE-GISEL-NEXT: bx lr +entry: + ret bfloat %f +} + +define void @caller_soft_bfloat_on_stack() { +; LE-LABEL: caller_soft_bfloat_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: .pad #8 +; LE-NEXT: sub sp, #8 +; LE-NEXT: mov.w r0, #16256 +; LE-NEXT: str r0, [sp] +; LE-NEXT: bl callee_soft_bfloat_on_stack +; LE-NEXT: add sp, #8 +; LE-NEXT: pop {r7, pc} +; +; BE-LABEL: caller_soft_bfloat_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: .pad #8 +; BE-NEXT: sub sp, #8 +; BE-NEXT: mov.w r0, #16256 +; BE-NEXT: str r0, [sp] +; BE-NEXT: bl callee_soft_bfloat_on_stack +; BE-NEXT: add sp, #8 +; BE-NEXT: pop {r7, pc} +; +; LE-BF16-LABEL: caller_soft_bfloat_on_stack: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: .save {r7, lr} +; LE-BF16-NEXT: push {r7, lr} +; LE-BF16-NEXT: .pad #8 +; LE-BF16-NEXT: sub sp, #8 +; LE-BF16-NEXT: mov.w r0, #16256 +; LE-BF16-NEXT: str r0, [sp] +; LE-BF16-NEXT: bl callee_soft_bfloat_on_stack +; LE-BF16-NEXT: add sp, #8 +; LE-BF16-NEXT: pop {r7, pc} +; +; BE-BF16-LABEL: caller_soft_bfloat_on_stack: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: .save {r7, lr} +; BE-BF16-NEXT: push {r7, lr} +; BE-BF16-NEXT: .pad #8 +; BE-BF16-NEXT: sub sp, #8 +; BE-BF16-NEXT: mov.w r0, #16256 +; BE-BF16-NEXT: str r0, [sp] +; BE-BF16-NEXT: bl callee_soft_bfloat_on_stack +; BE-BF16-NEXT: add sp, #8 +; BE-BF16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_soft_bfloat_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: .pad #8 +; LE-GISEL-NEXT: sub sp, #8 +; LE-GISEL-NEXT: mov.w r0, #16256 +; LE-GISEL-NEXT: str r0, [sp] +; LE-GISEL-NEXT: bl callee_soft_bfloat_on_stack +; LE-GISEL-NEXT: add sp, #8 +; LE-GISEL-NEXT: pop {r7, pc} +entry: + %ret = call arm_aapcscc bfloat @callee_soft_bfloat_on_stack(float poison, float poison, float poison, float poison, bfloat 1.0) + ret void +} + +define arm_aapcs_vfpcc bfloat @callee_hard_bfloat_in_reg(bfloat %f) { +; LE-LABEL: callee_hard_bfloat_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: bx lr +; +; BE-LABEL: callee_hard_bfloat_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: bx lr +; +; LE-BF16-LABEL: callee_hard_bfloat_in_reg: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: .pad #4 +; LE-BF16-NEXT: sub sp, #4 +; LE-BF16-NEXT: vmov r0, s0 +; LE-BF16-NEXT: strh.w r0, [sp, #2] +; LE-BF16-NEXT: ldrh.w r0, [sp, #2] +; LE-BF16-NEXT: vmov s0, r0 +; LE-BF16-NEXT: add sp, #4 +; LE-BF16-NEXT: bx lr +; +; BE-BF16-LABEL: callee_hard_bfloat_in_reg: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: .pad #4 +; BE-BF16-NEXT: sub sp, #4 +; BE-BF16-NEXT: vmov r0, s0 +; BE-BF16-NEXT: strh.w r0, [sp, #2] +; BE-BF16-NEXT: ldrh.w r0, [sp, #2] +; BE-BF16-NEXT: vmov s0, r0 +; BE-BF16-NEXT: add sp, #4 +; BE-BF16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_hard_bfloat_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: bx lr +entry: + ret bfloat %f +} + +define void @caller_hard_bfloat_in_reg() { +; LE-LABEL: caller_hard_bfloat_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: vldr s0, .LCPI5_0 +; LE-NEXT: bl callee_hard_bfloat_in_reg +; LE-NEXT: pop {r7, pc} +; LE-NEXT: .p2align 2 +; LE-NEXT: @ %bb.1: +; LE-NEXT: .LCPI5_0: +; LE-NEXT: .long 0x00003f80 @ float 2.27795078E-41 +; +; BE-LABEL: caller_hard_bfloat_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} 
+; BE-NEXT: vldr s0, .LCPI5_0 +; BE-NEXT: bl callee_hard_bfloat_in_reg +; BE-NEXT: pop {r7, pc} +; BE-NEXT: .p2align 2 +; BE-NEXT: @ %bb.1: +; BE-NEXT: .LCPI5_0: +; BE-NEXT: .long 0x00003f80 @ float 2.27795078E-41 +; +; LE-BF16-LABEL: caller_hard_bfloat_in_reg: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: .save {r7, lr} +; LE-BF16-NEXT: push {r7, lr} +; LE-BF16-NEXT: vldr s0, .LCPI5_0 +; LE-BF16-NEXT: bl callee_hard_bfloat_in_reg +; LE-BF16-NEXT: pop {r7, pc} +; LE-BF16-NEXT: .p2align 2 +; LE-BF16-NEXT: @ %bb.1: +; LE-BF16-NEXT: .LCPI5_0: +; LE-BF16-NEXT: .long 0x00003f80 @ float 2.27795078E-41 +; +; BE-BF16-LABEL: caller_hard_bfloat_in_reg: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: .save {r7, lr} +; BE-BF16-NEXT: push {r7, lr} +; BE-BF16-NEXT: vldr s0, .LCPI5_0 +; BE-BF16-NEXT: bl callee_hard_bfloat_in_reg +; BE-BF16-NEXT: pop {r7, pc} +; BE-BF16-NEXT: .p2align 2 +; BE-BF16-NEXT: @ %bb.1: +; BE-BF16-NEXT: .LCPI5_0: +; BE-BF16-NEXT: .long 0x00003f80 @ float 2.27795078E-41 +; +; LE-GISEL-LABEL: caller_hard_bfloat_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: vldr s0, .LCPI5_0 +; LE-GISEL-NEXT: bl callee_hard_bfloat_in_reg +; LE-GISEL-NEXT: pop {r7, pc} +; LE-GISEL-NEXT: .p2align 2 +; LE-GISEL-NEXT: @ %bb.1: +; LE-GISEL-NEXT: .LCPI5_0: +; LE-GISEL-NEXT: .long 0x00003f80 @ float 2.27795078E-41 +entry: + %ret = call arm_aapcs_vfpcc bfloat @callee_hard_bfloat_in_reg(bfloat 1.0) + ret void +} + +define arm_aapcs_vfpcc bfloat @callee_hard_bfloat_on_stack(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7,float %s8, float %s9, float %s10, float %s11, float %s12, float %s13, float %s14, float %s15, bfloat %f) { +; LE-LABEL: callee_hard_bfloat_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: vldr s0, [sp] +; LE-NEXT: bx lr +; +; BE-LABEL: callee_hard_bfloat_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: vldr s0, [sp] +; BE-NEXT: bx lr +; +; LE-BF16-LABEL: callee_hard_bfloat_on_stack: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: ldrh.w r0, [sp] +; LE-BF16-NEXT: vmov s0, r0 +; LE-BF16-NEXT: bx lr +; +; BE-BF16-LABEL: callee_hard_bfloat_on_stack: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: ldrh.w r0, [sp, #2] +; BE-BF16-NEXT: vmov s0, r0 +; BE-BF16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_hard_bfloat_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: mov r0, sp +; LE-GISEL-NEXT: ldr r0, [r0] +; LE-GISEL-NEXT: vmov s0, r0 +; LE-GISEL-NEXT: bx lr +entry: + ret bfloat %f +} + + +define void @caller_hard_bfloat_on_stack() { +; LE-LABEL: caller_hard_bfloat_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: .pad #8 +; LE-NEXT: sub sp, #8 +; LE-NEXT: mov.w r0, #16256 +; LE-NEXT: str r0, [sp] +; LE-NEXT: bl callee_hard_bfloat_on_stack +; LE-NEXT: add sp, #8 +; LE-NEXT: pop {r7, pc} +; +; BE-LABEL: caller_hard_bfloat_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: .pad #8 +; BE-NEXT: sub sp, #8 +; BE-NEXT: mov.w r0, #16256 +; BE-NEXT: str r0, [sp] +; BE-NEXT: bl callee_hard_bfloat_on_stack +; BE-NEXT: add sp, #8 +; BE-NEXT: pop {r7, pc} +; +; LE-BF16-LABEL: caller_hard_bfloat_on_stack: +; LE-BF16: @ %bb.0: @ %entry +; LE-BF16-NEXT: .save {r7, lr} +; LE-BF16-NEXT: push {r7, lr} +; LE-BF16-NEXT: .pad #8 +; LE-BF16-NEXT: sub sp, #8 +; LE-BF16-NEXT: mov.w r0, #16256 +; LE-BF16-NEXT: str r0, [sp] +; LE-BF16-NEXT: bl callee_hard_bfloat_on_stack +; LE-BF16-NEXT: add sp, #8 +; LE-BF16-NEXT: 
pop {r7, pc} +; +; BE-BF16-LABEL: caller_hard_bfloat_on_stack: +; BE-BF16: @ %bb.0: @ %entry +; BE-BF16-NEXT: .save {r7, lr} +; BE-BF16-NEXT: push {r7, lr} +; BE-BF16-NEXT: .pad #8 +; BE-BF16-NEXT: sub sp, #8 +; BE-BF16-NEXT: mov.w r0, #16256 +; BE-BF16-NEXT: str r0, [sp] +; BE-BF16-NEXT: bl callee_hard_bfloat_on_stack +; BE-BF16-NEXT: add sp, #8 +; BE-BF16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_hard_bfloat_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: .pad #8 +; LE-GISEL-NEXT: sub sp, #8 +; LE-GISEL-NEXT: mov.w r0, #16256 +; LE-GISEL-NEXT: str r0, [sp] +; LE-GISEL-NEXT: bl callee_hard_bfloat_on_stack +; LE-GISEL-NEXT: add sp, #8 +; LE-GISEL-NEXT: pop {r7, pc} +entry: + %ret = call arm_aapcs_vfpcc bfloat @callee_hard_bfloat_on_stack(float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, bfloat 1.0) + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/fp16-pcs.ll b/llvm/test/CodeGen/Thumb2/fp16-pcs.ll new file mode 100644 index 0000000000000..c0239a009f476 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/fp16-pcs.ll @@ -0,0 +1,360 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 | FileCheck %s --check-prefix=LE +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 | FileCheck %s --check-prefix=BE +; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+fullfp16 | FileCheck %s --check-prefix=LE-FP16 +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+fullfp16 | FileCheck %s --check-prefix=BE-FP16 + +;; Global ISel successfully generates code for some functions for little-endian +;; without +fullfp16, and falls back to SelectionDAG in all others. 
+; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=LE-GISEL +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=BE +; RUN: llc -mtriple=armv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+fullfp16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=LE-FP16 +; RUN: llc -mtriple=armebv8m.main-none-eabi < %s -frame-pointer=none -mattr=+fp-armv8d16,+fullfp16 -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefix=BE-FP16 + +define arm_aapcscc half @callee_soft_half_in_reg(half %f) { +; LE-LABEL: callee_soft_half_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: bx lr +; +; BE-LABEL: callee_soft_half_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: bx lr +; +; LE-FP16-LABEL: callee_soft_half_in_reg: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: vmov.f16 s0, r0 +; LE-FP16-NEXT: vmov r0, s0 +; LE-FP16-NEXT: bx lr +; +; BE-FP16-LABEL: callee_soft_half_in_reg: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: vmov.f16 s0, r0 +; BE-FP16-NEXT: vmov r0, s0 +; BE-FP16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_soft_half_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: bx lr +entry: + ret half %f +} + +define void @caller_soft_half_in_reg() { +; LE-LABEL: caller_soft_half_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: mov.w r0, #15360 +; LE-NEXT: bl callee_soft_half_in_reg +; LE-NEXT: pop {r7, pc} +; +; BE-LABEL: caller_soft_half_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: mov.w r0, #15360 +; BE-NEXT: bl callee_soft_half_in_reg +; BE-NEXT: pop {r7, pc} +; +; LE-FP16-LABEL: caller_soft_half_in_reg: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: .save {r7, lr} +; LE-FP16-NEXT: push {r7, lr} +; LE-FP16-NEXT: mov.w r0, #15360 +; LE-FP16-NEXT: bl callee_soft_half_in_reg +; LE-FP16-NEXT: pop {r7, pc} +; +; BE-FP16-LABEL: caller_soft_half_in_reg: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: .save {r7, lr} +; BE-FP16-NEXT: push {r7, lr} +; BE-FP16-NEXT: mov.w r0, #15360 +; BE-FP16-NEXT: bl callee_soft_half_in_reg +; BE-FP16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_soft_half_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: mov.w r0, #15360 +; LE-GISEL-NEXT: bl callee_soft_half_in_reg +; LE-GISEL-NEXT: pop {r7, pc} +entry: + %ret = call arm_aapcscc half @callee_soft_half_in_reg(half 1.0) + ret void +} + +define arm_aapcscc half @callee_soft_half_on_stack(float %r0, float %r1, float %r2, float %r3, half %f) { +; LE-LABEL: callee_soft_half_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: ldr r0, [sp] +; LE-NEXT: bx lr +; +; BE-LABEL: callee_soft_half_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: ldr r0, [sp] +; BE-NEXT: bx lr +; +; LE-FP16-LABEL: callee_soft_half_on_stack: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: vldr.16 s0, [sp] +; LE-FP16-NEXT: vmov r0, s0 +; LE-FP16-NEXT: bx lr +; +; BE-FP16-LABEL: callee_soft_half_on_stack: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: vldr.16 s0, [sp, #2] +; BE-FP16-NEXT: vmov r0, s0 +; BE-FP16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_soft_half_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: mov r0, sp +; LE-GISEL-NEXT: ldr r0, [r0] +; LE-GISEL-NEXT: bx lr +entry: + ret half %f +} + +define void @caller_soft_half_on_stack() { +; LE-LABEL: 
caller_soft_half_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: .pad #8 +; LE-NEXT: sub sp, #8 +; LE-NEXT: mov.w r0, #15360 +; LE-NEXT: str r0, [sp] +; LE-NEXT: bl callee_soft_half_on_stack +; LE-NEXT: add sp, #8 +; LE-NEXT: pop {r7, pc} +; +; BE-LABEL: caller_soft_half_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: .pad #8 +; BE-NEXT: sub sp, #8 +; BE-NEXT: mov.w r0, #15360 +; BE-NEXT: str r0, [sp] +; BE-NEXT: bl callee_soft_half_on_stack +; BE-NEXT: add sp, #8 +; BE-NEXT: pop {r7, pc} +; +; LE-FP16-LABEL: caller_soft_half_on_stack: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: .save {r7, lr} +; LE-FP16-NEXT: push {r7, lr} +; LE-FP16-NEXT: .pad #8 +; LE-FP16-NEXT: sub sp, #8 +; LE-FP16-NEXT: mov.w r0, #15360 +; LE-FP16-NEXT: str r0, [sp] +; LE-FP16-NEXT: bl callee_soft_half_on_stack +; LE-FP16-NEXT: add sp, #8 +; LE-FP16-NEXT: pop {r7, pc} +; +; BE-FP16-LABEL: caller_soft_half_on_stack: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: .save {r7, lr} +; BE-FP16-NEXT: push {r7, lr} +; BE-FP16-NEXT: .pad #8 +; BE-FP16-NEXT: sub sp, #8 +; BE-FP16-NEXT: mov.w r0, #15360 +; BE-FP16-NEXT: str r0, [sp] +; BE-FP16-NEXT: bl callee_soft_half_on_stack +; BE-FP16-NEXT: add sp, #8 +; BE-FP16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_soft_half_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: .pad #8 +; LE-GISEL-NEXT: sub sp, #8 +; LE-GISEL-NEXT: mov.w r0, #15360 +; LE-GISEL-NEXT: str r0, [sp] +; LE-GISEL-NEXT: bl callee_soft_half_on_stack +; LE-GISEL-NEXT: add sp, #8 +; LE-GISEL-NEXT: pop {r7, pc} +entry: + %ret = call arm_aapcscc half @callee_soft_half_on_stack(float poison, float poison, float poison, float poison, half 1.0) + ret void +} + +define arm_aapcs_vfpcc half @callee_hard_half_in_reg(half %f) { +; LE-LABEL: callee_hard_half_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: bx lr +; +; BE-LABEL: callee_hard_half_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: bx lr +; +; LE-FP16-LABEL: callee_hard_half_in_reg: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: bx lr +; +; BE-FP16-LABEL: callee_hard_half_in_reg: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_hard_half_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: bx lr +entry: + ret half %f +} + +define void @caller_hard_half_in_reg() { +; LE-LABEL: caller_hard_half_in_reg: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: vldr s0, .LCPI5_0 +; LE-NEXT: bl callee_hard_half_in_reg +; LE-NEXT: pop {r7, pc} +; LE-NEXT: .p2align 2 +; LE-NEXT: @ %bb.1: +; LE-NEXT: .LCPI5_0: +; LE-NEXT: .long 0x00003c00 @ float 2.15239444E-41 +; +; BE-LABEL: caller_hard_half_in_reg: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: vldr s0, .LCPI5_0 +; BE-NEXT: bl callee_hard_half_in_reg +; BE-NEXT: pop {r7, pc} +; BE-NEXT: .p2align 2 +; BE-NEXT: @ %bb.1: +; BE-NEXT: .LCPI5_0: +; BE-NEXT: .long 0x00003c00 @ float 2.15239444E-41 +; +; LE-FP16-LABEL: caller_hard_half_in_reg: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: .save {r7, lr} +; LE-FP16-NEXT: push {r7, lr} +; LE-FP16-NEXT: vmov.f16 s0, #1.000000e+00 +; LE-FP16-NEXT: bl callee_hard_half_in_reg +; LE-FP16-NEXT: pop {r7, pc} +; +; BE-FP16-LABEL: caller_hard_half_in_reg: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: .save {r7, lr} +; BE-FP16-NEXT: push {r7, lr} +; BE-FP16-NEXT: vmov.f16 s0, #1.000000e+00 +; 
BE-FP16-NEXT: bl callee_hard_half_in_reg +; BE-FP16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_hard_half_in_reg: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: vldr s0, .LCPI5_0 +; LE-GISEL-NEXT: bl callee_hard_half_in_reg +; LE-GISEL-NEXT: pop {r7, pc} +; LE-GISEL-NEXT: .p2align 2 +; LE-GISEL-NEXT: @ %bb.1: +; LE-GISEL-NEXT: .LCPI5_0: +; LE-GISEL-NEXT: .long 0x00003c00 @ float 2.15239444E-41 +entry: + %ret = call arm_aapcs_vfpcc half @callee_hard_half_in_reg(half 1.0) + ret void +} + +define arm_aapcs_vfpcc half @callee_hard_half_on_stack(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7,float %s8, float %s9, float %s10, float %s11, float %s12, float %s13, float %s14, float %s15, half %f) { +; LE-LABEL: callee_hard_half_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: vldr s0, [sp] +; LE-NEXT: bx lr +; +; BE-LABEL: callee_hard_half_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: vldr s0, [sp] +; BE-NEXT: bx lr +; +; LE-FP16-LABEL: callee_hard_half_on_stack: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: vldr.16 s0, [sp] +; LE-FP16-NEXT: bx lr +; +; BE-FP16-LABEL: callee_hard_half_on_stack: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: vldr.16 s0, [sp, #2] +; BE-FP16-NEXT: bx lr +; +; LE-GISEL-LABEL: callee_hard_half_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: mov r0, sp +; LE-GISEL-NEXT: ldr r0, [r0] +; LE-GISEL-NEXT: vmov s0, r0 +; LE-GISEL-NEXT: bx lr +entry: + ret half %f +} + + +define void @caller_hard_half_on_stack() { +; LE-LABEL: caller_hard_half_on_stack: +; LE: @ %bb.0: @ %entry +; LE-NEXT: .save {r7, lr} +; LE-NEXT: push {r7, lr} +; LE-NEXT: .pad #8 +; LE-NEXT: sub sp, #8 +; LE-NEXT: mov.w r0, #15360 +; LE-NEXT: str r0, [sp] +; LE-NEXT: bl callee_hard_half_on_stack +; LE-NEXT: add sp, #8 +; LE-NEXT: pop {r7, pc} +; +; BE-LABEL: caller_hard_half_on_stack: +; BE: @ %bb.0: @ %entry +; BE-NEXT: .save {r7, lr} +; BE-NEXT: push {r7, lr} +; BE-NEXT: .pad #8 +; BE-NEXT: sub sp, #8 +; BE-NEXT: mov.w r0, #15360 +; BE-NEXT: str r0, [sp] +; BE-NEXT: bl callee_hard_half_on_stack +; BE-NEXT: add sp, #8 +; BE-NEXT: pop {r7, pc} +; +; LE-FP16-LABEL: caller_hard_half_on_stack: +; LE-FP16: @ %bb.0: @ %entry +; LE-FP16-NEXT: .save {r7, lr} +; LE-FP16-NEXT: push {r7, lr} +; LE-FP16-NEXT: .pad #8 +; LE-FP16-NEXT: sub sp, #8 +; LE-FP16-NEXT: mov.w r0, #15360 +; LE-FP16-NEXT: str r0, [sp] +; LE-FP16-NEXT: bl callee_hard_half_on_stack +; LE-FP16-NEXT: add sp, #8 +; LE-FP16-NEXT: pop {r7, pc} +; +; BE-FP16-LABEL: caller_hard_half_on_stack: +; BE-FP16: @ %bb.0: @ %entry +; BE-FP16-NEXT: .save {r7, lr} +; BE-FP16-NEXT: push {r7, lr} +; BE-FP16-NEXT: .pad #8 +; BE-FP16-NEXT: sub sp, #8 +; BE-FP16-NEXT: mov.w r0, #15360 +; BE-FP16-NEXT: str r0, [sp] +; BE-FP16-NEXT: bl callee_hard_half_on_stack +; BE-FP16-NEXT: add sp, #8 +; BE-FP16-NEXT: pop {r7, pc} +; +; LE-GISEL-LABEL: caller_hard_half_on_stack: +; LE-GISEL: @ %bb.0: @ %entry +; LE-GISEL-NEXT: .save {r7, lr} +; LE-GISEL-NEXT: push {r7, lr} +; LE-GISEL-NEXT: .pad #8 +; LE-GISEL-NEXT: sub sp, #8 +; LE-GISEL-NEXT: mov.w r0, #15360 +; LE-GISEL-NEXT: str r0, [sp] +; LE-GISEL-NEXT: bl callee_hard_half_on_stack +; LE-GISEL-NEXT: add sp, #8 +; LE-GISEL-NEXT: pop {r7, pc} +entry: + %ret = call arm_aapcs_vfpcc half @callee_hard_half_on_stack(float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 
poison, float poison, half 1.0) + ret void +} diff --git a/llvm/test/CodeGen/X86/2011-10-27-tstore.ll b/llvm/test/CodeGen/X86/2011-10-27-tstore.ll index a31d3ad45fe4f..a3b79c291aefa 100644 --- a/llvm/test/CodeGen/X86/2011-10-27-tstore.ll +++ b/llvm/test/CodeGen/X86/2011-10-27-tstore.ll @@ -1,12 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mcpu=corei7 | FileCheck %s target triple = "x86_64-unknown-linux-gnu" -;CHECK-LABEL: ltstore: -;CHECK: movq -;CHECK: movq -;CHECK: ret define void @ltstore(ptr %pA, ptr %pB) { +; CHECK-LABEL: ltstore: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: retq entry: %in = load <4 x i32>, ptr %pA %j = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/align-down.ll b/llvm/test/CodeGen/X86/align-down.ll index 3c64e108e06dd..c359c04f527a3 100644 --- a/llvm/test/CodeGen/X86/align-down.ll +++ b/llvm/test/CodeGen/X86/align-down.ll @@ -82,25 +82,40 @@ define i32 @t2_commutative(i32 %ptr, i32 %alignment) nounwind { ; Extra use tests define i32 @t3_extrause0(i32 %ptr, i32 %alignment, ptr %mask_storage) nounwind { -; X86-LABEL: t3_extrause0: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal -1(%eax), %edx -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: negl %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: retl +; NOBMI-X86-LABEL: t3_extrause0: +; NOBMI-X86: # %bb.0: +; NOBMI-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; NOBMI-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; NOBMI-X86-NEXT: decl %eax +; NOBMI-X86-NEXT: movl %eax, (%ecx) +; NOBMI-X86-NEXT: notl %eax +; NOBMI-X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; NOBMI-X86-NEXT: retl ; -; X64-LABEL: t3_extrause0: -; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: leal -1(%rax), %ecx -; X64-NEXT: movl %ecx, (%rdx) -; X64-NEXT: negl %eax -; X64-NEXT: andl %edi, %eax -; X64-NEXT: # kill: def $eax killed $eax killed $rax -; X64-NEXT: retq +; BMI-X86-LABEL: t3_extrause0: +; BMI-X86: # %bb.0: +; BMI-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; BMI-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; BMI-X86-NEXT: decl %ecx +; BMI-X86-NEXT: movl %ecx, (%eax) +; BMI-X86-NEXT: andnl {{[0-9]+}}(%esp), %ecx, %eax +; BMI-X86-NEXT: retl +; +; NOBMI-X64-LABEL: t3_extrause0: +; NOBMI-X64: # %bb.0: +; NOBMI-X64-NEXT: # kill: def $esi killed $esi def $rsi +; NOBMI-X64-NEXT: leal -1(%rsi), %eax +; NOBMI-X64-NEXT: movl %eax, (%rdx) +; NOBMI-X64-NEXT: notl %eax +; NOBMI-X64-NEXT: andl %edi, %eax +; NOBMI-X64-NEXT: retq +; +; BMI-X64-LABEL: t3_extrause0: +; BMI-X64: # %bb.0: +; BMI-X64-NEXT: decl %esi +; BMI-X64-NEXT: movl %esi, (%rdx) +; BMI-X64-NEXT: andnl %edi, %esi, %eax +; BMI-X64-NEXT: retq %mask = add i32 %alignment, -1 store i32 %mask, ptr %mask_storage %bias = and i32 %ptr, %mask diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index cad1d09f11d9c..4c4d5cb3166a8 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: 
vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpbroadcastq %xmm0, 
%ymm0 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ 
-2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) @@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) @@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 3d72319f59ca9..16f0614743463 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -910,10 +910,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1010,8 +1008,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -1022,8 +1019,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -1032,8 +1029,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -1041,12 +1038,11 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; ; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1108,8 +1104,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -1120,8 +1115,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -1130,8 +1125,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -1139,12 +1134,11 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; ; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; 
AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1307,11 +1301,8 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper @@ -1397,8 +1388,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1407,7 +1397,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -1416,7 +1406,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -1675,10 +1665,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512F-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1694,10 +1683,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512DQ-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1713,10 +1701,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] -; AVX512BW-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -1903,17 +1890,15 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX2-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: 
vpbroadcastb (%rdi), %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2136,8 +2121,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 @@ -2150,8 +2134,8 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2164,8 +2148,8 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2178,10 +2162,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512BW-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vpbroadcastd (%rdi), %ymm1 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -2369,8 +2352,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 @@ -2383,8 +2365,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2397,8 +2379,8 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2411,10 +2393,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX512BW-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm1 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -2824,10 +2805,8 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) @@ -3033,8 +3012,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3046,7 +3024,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm1 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -3059,7 +3037,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm1 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -3634,19 +3612,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -3820,19 +3797,18 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: diff --git 
a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll index f4eb5b952ae43..4da29715f1555 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1998,10 +1998,7 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind { ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero ; X64-NEXT: retq ; ; X86-LABEL: test21: @@ -2010,10 +2007,7 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind { ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpbroadcastw %xmm1, %xmm1 -; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; X86-NEXT: retl %1 = insertelement <8 x half> , half %a, i32 0 %2 = insertelement <8 x half> %1, half %b, i32 1 diff --git a/llvm/test/CodeGen/X86/coalescer-subreg.ll b/llvm/test/CodeGen/X86/coalescer-subreg.ll index b0f30b43b9200..5115a0658ad05 100644 --- a/llvm/test/CodeGen/X86/coalescer-subreg.ll +++ b/llvm/test/CodeGen/X86/coalescer-subreg.ll @@ -3,9 +3,9 @@ ; the sub_8bit_hi subregister with a class like GR16_ABCD that did. 
target triple = "x86_64-apple-macosx10.10.0" -define void @test() #0 { +define void @test(i1 %arg) #0 { entry: - br i1 undef, label %loop, label %for.end597 + br i1 %arg, label %loop, label %for.end597 loop: %0 = load i16, ptr null, align 4 diff --git a/llvm/test/CodeGen/X86/code_placement_eh.ll b/llvm/test/CodeGen/X86/code_placement_eh.ll index 160ead782473b..56d642fc0e0e1 100644 --- a/llvm/test/CodeGen/X86/code_placement_eh.ll +++ b/llvm/test/CodeGen/X86/code_placement_eh.ll @@ -6,12 +6,12 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" target triple = "i386-apple-darwin10.0" -define void @foo() personality ptr @__gxx_personality_v0 { +define void @foo(i1 %arg) personality ptr @__gxx_personality_v0 { invcont5: br label %bb15 .noexc3: ; preds = %bb15 - br i1 undef, label %bb18.i5.i, label %bb15 + br i1 %arg, label %bb18.i5.i, label %bb15 .noexc6.i.i: ; preds = %bb18.i5.i %tmp2021 = invoke float @cosf(float 0.000000e+00) readonly diff --git a/llvm/test/CodeGen/X86/codegen-prepare-cast.ll b/llvm/test/CodeGen/X86/codegen-prepare-cast.ll index c55d53258beba..5eb66f0282244 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-cast.ll +++ b/llvm/test/CodeGen/X86/codegen-prepare-cast.ll @@ -12,10 +12,10 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK-LABEL: @_Dmain ; CHECK: load i8, ptr %tmp4 ; CHECK: ret -define fastcc i32 @_Dmain(%"char[][]" %unnamed) { +define fastcc i32 @_Dmain(%"char[][]" %unnamed, i1 %arg) { entry: %tmp = getelementptr [7 x i8], ptr @.str, i32 0, i32 0 ; [#uses=1] - br i1 undef, label %foreachbody, label %foreachend + br i1 %arg, label %foreachbody, label %foreachend foreachbody: ; preds = %entry %tmp4 = getelementptr i8, ptr %tmp, i32 undef ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index 26a88ab15e3cc..d9393ba9febb2 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -733,8 +733,8 @@ define <16 x i16> @vec256_eltty_i16_source_subvec_1_target_subvec_mask_3_unary(< define <16 x i16> @vec256_eltty_i16_source_subvec_1_target_subvec_mask_3_binary(<16 x i16> %x, <16 x i16> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i16_source_subvec_1_target_subvec_mask_3_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17] +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vpbroadcastw %xmm1, %ymm1 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; CHECK-NEXT: retq %r = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> @@ -799,8 +799,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_unary(<32 define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] +; CHECK-NEXT: 
vpbroadcastb %xmm1, %ymm1 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -870,8 +869,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_unary(<32 define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_binary(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/discontiguous-loops.ll b/llvm/test/CodeGen/X86/discontiguous-loops.ll index f87dcd9e65388..ec65fe99c74d0 100644 --- a/llvm/test/CodeGen/X86/discontiguous-loops.ll +++ b/llvm/test/CodeGen/X86/discontiguous-loops.ll @@ -3,7 +3,7 @@ @.str96 = external constant [37 x i8], align 8 ; [#uses=1] -define void @foo() nounwind { +define void @foo(i1 %arg) nounwind { bb: br label %ybb1 @@ -24,7 +24,7 @@ bb3: ; preds = %ybb2 xbb4: ; preds = %xbb6 store i32 0, ptr undef, align 8 - br i1 undef, label %xbb6, label %bb5 + br i1 %arg, label %xbb6, label %bb5 bb5: ; preds = %xbb4 call fastcc void @decl_mode_check_failed() nounwind @@ -44,7 +44,7 @@ bb10: ; preds = %ybb8 unreachable ybb12: ; preds = %ybb8 - br i1 undef, label %bb15, label %ybb13 + br i1 %arg, label %bb15, label %ybb13 ybb13: ; preds = %ybb12 %tmp14 = icmp sgt i32 undef, 0 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/early-ifcvt-crash.ll b/llvm/test/CodeGen/X86/early-ifcvt-crash.ll index 3cf98af1cfe70..28523c4a9d890 100644 --- a/llvm/test/CodeGen/X86/early-ifcvt-crash.ll +++ b/llvm/test/CodeGen/X86/early-ifcvt-crash.ll @@ -10,15 +10,15 @@ target triple = "x86_64-apple-macosx10.8.0" ; MachineTraceMetrics::Ensemble::addLiveIns crashes because the first operand ; on an inline asm instruction is not a vreg def. 
; -define void @f1() nounwind { +define void @f1(i1 %arg) nounwind { entry: - br i1 undef, label %if.then6.i, label %if.end.i + br i1 %arg, label %if.then6.i, label %if.end.i if.then6.i: br label %if.end.i if.end.i: - br i1 undef, label %if.end25.i, label %if.else17.i + br i1 %arg, label %if.end25.i, label %if.else17.i if.else17.i: %shl24.i = shl i32 undef, undef diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll index dea29b7b5b93d..632f3c6c1e851 100644 --- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll @@ -531,9 +531,11 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,0] -; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $40, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 8 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll b/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll index 10e192e385018..a4e5ae66b1fd8 100644 --- a/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll +++ b/llvm/test/CodeGen/X86/fast-isel-stackcheck.ll @@ -19,16 +19,16 @@ entry: ; CHECK-DAG: movq ___stack_chk_guard@GOTPCREL(%rip), %[[GUARD:r.x]] ; CHECK-DAG: movq {{[0-9]+}}(%rsp), %[[CANARY:r.x]] ; CHECK: subq %[[CANARY]], %[[GUARD]] -define void @bar() #1 { +define void @bar(i1 %arg) #1 { entry: %vt = alloca [2 x double], align 16 - br i1 undef, label %cleanup.4091, label %for.cond.3850 + br i1 %arg, label %cleanup.4091, label %for.cond.3850 unreachable: unreachable for.cond.3850: - br i1 undef, label %land.rhs.3853, label %land.end.3857 + br i1 %arg, label %land.rhs.3853, label %land.end.3857 land.rhs.3853: br label %land.end.3857 diff --git a/llvm/test/CodeGen/X86/fp-fold.ll b/llvm/test/CodeGen/X86/fp-fold.ll index 74b5232a4df62..b86894cadecc4 100644 --- a/llvm/test/CodeGen/X86/fp-fold.ll +++ b/llvm/test/CodeGen/X86/fp-fold.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s define float @fadd_zero_strict(float %x) { @@ -31,7 +32,7 @@ define float @fadd_produce_zero(float %x) { define float @fadd_reassociate(float %x) { ; CHECK-LABEL: fadd_reassociate: ; CHECK: # %bb.0: -; CHECK-NEXT: addss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %sum = fadd float %x, 8.0 %r = fadd reassoc nsz float %sum, 12.0 @@ -85,7 +86,7 @@ define float @fsub_neg_x_y(float %x, float %y) { define float @fsub_neg_y(float %x, float %y) { ; CHECK-LABEL: fsub_neg_y: ; CHECK: # %bb.0: -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul float %x, 5.0 %add = fadd float %mul, %y @@ -96,7 +97,7 @@ define float @fsub_neg_y(float %x, float %y) { define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: 
fsub_neg_y_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul <4 x float> %x, %add = fadd <4 x float> %mul, %y @@ -107,7 +108,7 @@ define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) { define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_neg_y_vector_nonuniform: ; CHECK: # %bb.0: -; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul <4 x float> %x, %add = fadd <4 x float> %mul, %y @@ -118,7 +119,7 @@ define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y) define float @fsub_neg_y_commute(float %x, float %y) { ; CHECK-LABEL: fsub_neg_y_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul float %x, 5.0 %add = fadd float %y, %mul @@ -129,7 +130,7 @@ define float @fsub_neg_y_commute(float %x, float %y) { define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_neg_y_commute_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul <4 x float> %x, %add = fadd <4 x float> %y, %mul @@ -142,7 +143,7 @@ define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) { define float @fsub_fadd_common_op_fneg(float %x, float %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %a = fadd float %x, %y %r = fsub reassoc nsz float %y, %a @@ -154,7 +155,7 @@ define float @fsub_fadd_common_op_fneg(float %x, float %y) { define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %a = fadd <4 x float> %x, %y %r = fsub nsz reassoc <4 x float> %y, %a @@ -167,7 +168,7 @@ define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y) define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %a = fadd float %y, %x %r = fsub reassoc nsz float %y, %a @@ -179,7 +180,7 @@ define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) { define <4 x float> @fsub_fadd_common_op_fneg_commute_vec(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %a = fadd <4 x float> %y, %x %r = fsub reassoc nsz <4 x float> %y, %a @@ -233,7 +234,7 @@ define float @fsub_zero_nsz_1(float %x) { define float @fsub_zero_nsz_2(float %x) { ; CHECK-LABEL: fsub_zero_nsz_2: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %r = fsub nsz float 0.0, %x ret float %r @@ -259,7 +260,7 @@ define float @fmul_one(float %x) { define float @fmul_x_const_const(float %x) { ; CHECK-LABEL: fmul_x_const_const: ; CHECK: # %bb.0: -; 
CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul reassoc float %x, 9.0 %r = fmul reassoc float %mul, 4.0 diff --git a/llvm/test/CodeGen/X86/fp-stack-O0-crash.ll b/llvm/test/CodeGen/X86/fp-stack-O0-crash.ll index b40365db05246..9104a10690015 100644 --- a/llvm/test/CodeGen/X86/fp-stack-O0-crash.ll +++ b/llvm/test/CodeGen/X86/fp-stack-O0-crash.ll @@ -31,14 +31,14 @@ if.end: ; preds = %if.then, %cond.fals ; PR10575 ; This produces a FP0 = IMPLICIT_DEF instruction. -define void @__m_rankmerge_MOD_dindexmerge_() nounwind { +define void @__m_rankmerge_MOD_dindexmerge_(i1 %arg) nounwind { entry: br label %"20" "20": ; preds = %"23", %entry %0 = phi double [ undef, %entry ], [ %0, %"23" ] %1 = phi double [ 0.000000e+00, %entry ], [ %2, %"23" ] - br i1 undef, label %"21", label %"23" + br i1 %arg, label %"21", label %"23" "21": ; preds = %"20" ret void diff --git a/llvm/test/CodeGen/X86/fp-stack.ll b/llvm/test/CodeGen/X86/fp-stack.ll index 8af656e2861ec..33a4d594d2bd7 100644 --- a/llvm/test/CodeGen/X86/fp-stack.ll +++ b/llvm/test/CodeGen/X86/fp-stack.ll @@ -3,12 +3,12 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" target triple = "i386-pc-linux-gnu" -define void @foo() nounwind { +define void @foo(i1 %arg) nounwind { entry: %tmp6 = load x86_fp80, ptr undef ; [#uses=2] %tmp15 = load x86_fp80, ptr undef ; [#uses=2] %tmp24 = load x86_fp80, ptr undef ; [#uses=1] - br i1 undef, label %return, label %bb.nph + br i1 %arg, label %return, label %bb.nph bb.nph: ; preds = %entry %cmp139 = fcmp ogt x86_fp80 %tmp15, %tmp6 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/implicit-null-chk-reg-rewrite.mir b/llvm/test/CodeGen/X86/implicit-null-chk-reg-rewrite.mir index d5afd3df0e73d..2dac678a49845 100644 --- a/llvm/test/CodeGen/X86/implicit-null-chk-reg-rewrite.mir +++ b/llvm/test/CodeGen/X86/implicit-null-chk-reg-rewrite.mir @@ -1,9 +1,9 @@ # RUN: llc -mtriple=x86_64 -run-pass=implicit-null-checks %s -o - | FileCheck %s --- | - define i32 @reg-rewrite(ptr %x) { + define i32 @reg-rewrite(ptr %x, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 diff --git a/llvm/test/CodeGen/X86/insert-positions.ll b/llvm/test/CodeGen/X86/insert-positions.ll index 92659f21a4b90..dab2b51a60852 100644 --- a/llvm/test/CodeGen/X86/insert-positions.ll +++ b/llvm/test/CodeGen/X86/insert-positions.ll @@ -2,12 +2,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -define void @test0() nounwind { +define void @test0(i1 %arg) nounwind { if.end90.i.i: br label %while.body.i.i221.i while.body.i.i221.i: ; preds = %while.cond.backedge.i.i.i, %if.end90.i.i - br i1 undef, label %if.then.i.i224.i, label %while.cond.backedge.i.i.i + br i1 %arg, label %if.then.i.i224.i, label %while.cond.backedge.i.i.i while.cond.backedge.i.i.i: ; preds = %for.end.i.i.i, %while.body.i.i221.i br label %while.body.i.i221.i @@ -29,10 +29,10 @@ for.cond.i.i226.i: ; preds = %for.body.i.i.i, %if %0 = phi i64 [ %tmp154.i.i.i, %for.body.i.i.i ], [ 0, %if.then.i.i224.i ] ; [#uses=2] %tmp154.i.i.i = add i64 %0, 1 ; [#uses=2] %i.0.i.i.i = trunc i64 %0 to i32 ; [#uses=1] - br i1 undef, label %land.rhs.i.i.i, label %for.end.i.i.i + br i1 %arg, label %land.rhs.i.i.i, 
label %for.end.i.i.i land.rhs.i.i.i: ; preds = %for.cond.i.i226.i - br i1 undef, label %for.body.i.i.i, label %for.end.i.i.i + br i1 %arg, label %for.body.i.i.i, label %for.end.i.i.i for.body.i.i.i: ; preds = %land.rhs.i.i.i br label %for.cond.i.i226.i @@ -45,7 +45,7 @@ for.end.i.i.i: ; preds = %land.rhs.i.i.i, %fo br label %while.cond.backedge.i.i.i } -define void @test1() nounwind { +define void @test1(i1 %arg) nounwind { entry: %t = shl i32 undef, undef ; [#uses=1] %t9 = sub nsw i32 0, %t ; [#uses=1] @@ -59,7 +59,7 @@ outer: ; preds = %bb18, %bb inner: ; preds = %bb16, %bb11 %t17 = phi i32 [ %i13, %outer ], [ undef, %inner ] ; [#uses=1] store i32 %t17, ptr undef - br i1 undef, label %bb18, label %inner + br i1 %arg, label %bb18, label %inner bb18: ; preds = %bb16 %t19 = add i32 %i13, %t9 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/legalize-sub-zero-2.ll b/llvm/test/CodeGen/X86/legalize-sub-zero-2.ll index 2c84ca66463c2..1d00764beaa81 100644 --- a/llvm/test/CodeGen/X86/legalize-sub-zero-2.ll +++ b/llvm/test/CodeGen/X86/legalize-sub-zero-2.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin -define fastcc void @foo(i32 %type) nounwind optsize { +define fastcc void @foo(i32 %type, i1 %arg) nounwind optsize { entry: switch i32 %type, label %bb26 [ i32 33634, label %bb11 @@ -15,19 +15,19 @@ bb26: ; preds = %entry bb27: ; preds = %bb11, %entry %srcpb.0 = phi i32 [ 1, %bb11 ], [ 0, %entry ] - br i1 undef, label %bb348, label %bb30.lr.ph + br i1 %arg, label %bb348, label %bb30.lr.ph bb30.lr.ph: ; preds = %bb27 %.sum743 = shl i32 %srcpb.0, 1 %0 = mul i32 %srcpb.0, -2 %.sum745 = add i32 %.sum743, %0 - br i1 undef, label %bb70, label %bb71 + br i1 %arg, label %bb70, label %bb71 bb70: ; preds = %bb30.lr.ph unreachable bb71: ; preds = %bb30.lr.ph - br i1 undef, label %bb92, label %bb80 + br i1 %arg, label %bb92, label %bb80 bb80: ; preds = %bb71 unreachable diff --git a/llvm/test/CodeGen/X86/licm-symbol.ll b/llvm/test/CodeGen/X86/licm-symbol.ll index e6b3f34ee9bb6..4e33d000c56da 100644 --- a/llvm/test/CodeGen/X86/licm-symbol.ll +++ b/llvm/test/CodeGen/X86/licm-symbol.ll @@ -21,12 +21,12 @@ target triple = "i386-apple-darwin8" declare i32 @fprintf(ptr nocapture) nounwind -define void @gcov_exit() nounwind { +define void @gcov_exit(i1 %arg) nounwind { entry: br label %bb151 bb151: ; preds = %bb59, %bb56, %bb14 - br i1 undef, label %bb56, label %bb59 + br i1 %arg, label %bb56, label %bb59 bb56: ; preds = %bb151 %t0 = call i32 (ptr) @fprintf(ptr getelementptr inbounds ([0 x %struct.FILE], ptr @__sF, i32 0, i32 2)) nounwind diff --git a/llvm/test/CodeGen/X86/liveness-local-regalloc.ll b/llvm/test/CodeGen/X86/liveness-local-regalloc.ll index 68e2e24d13eaf..c4293ec42a578 100644 --- a/llvm/test/CodeGen/X86/liveness-local-regalloc.ll +++ b/llvm/test/CodeGen/X86/liveness-local-regalloc.ll @@ -9,7 +9,7 @@ declare fastcc i32 @func(ptr, i32, i32) nounwind ssp -define fastcc void @func2(ptr %arg, i32 %arg1) nounwind ssp { +define fastcc void @func2(ptr %arg, i32 %arg1, i1 %arg2) nounwind ssp { bb: br label %.exit3 @@ -20,7 +20,7 @@ bb: ] bb2: ; preds = %bb5, %bb3, %.exit3 - br i1 undef, label %bb3, label %bb5 + br i1 %arg2, label %bb3, label %bb5 bb3: ; preds = %bb2 switch i32 undef, label %infloop [ @@ -41,7 +41,7 @@ bb5: ; preds = %bb2 .loopexit: ; preds = %bb5, %bb4, %bb3, %.exit3 %.04 = phi i32 [ %tmp, %bb4 ], [ undef, %bb3 ], [ undef, %.exit3 ], [ undef, %bb5 ] ; [#uses=2] - br i1 undef, label %bb8, label %bb6 + br i1 %arg2, label %bb8, label %bb6 bb6: ; preds = %.loopexit %tmp7 = tail call 
fastcc i32 @func(ptr %arg, i32 %.04, i32 undef) nounwind ssp ; [#uses=0] diff --git a/llvm/test/CodeGen/X86/lsr-overflow.ll b/llvm/test/CodeGen/X86/lsr-overflow.ll index 79440c282be75..def57c542adc1 100644 --- a/llvm/test/CodeGen/X86/lsr-overflow.ll +++ b/llvm/test/CodeGen/X86/lsr-overflow.ll @@ -26,7 +26,7 @@ __ABContainsLabel.exit: ret i1 %cmp } -define void @func_37() noreturn nounwind readonly { +define void @func_37(i1 %arg) noreturn nounwind readonly { entry: br label %for.body @@ -34,7 +34,7 @@ for.body: ; preds = %for.inc8, %entry %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.inc8 ] %sub.i = add i64 undef, %indvar %cmp.i = icmp eq i64 %sub.i, -9223372036854775808 - br i1 undef, label %for.inc8, label %for.cond4 + br i1 %arg, label %for.inc8, label %for.cond4 for.cond4: ; preds = %for.cond4, %for.body br label %for.cond4 diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index a38ca339cd5e1..bb7ab4a666859 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -2596,15 +2596,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm0, %ymm14, %ymm14 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm4 @@ -2659,15 +2659,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm0, %ymm15, %ymm15 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm14[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 ; AVX512F-NEXT: vmulps %ymm12, %ymm11, %ymm12 ; AVX512F-NEXT: vaddps %ymm12, %ymm15, %ymm12 ; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 ; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm14[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm5 @@ -2721,15 +2721,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm15 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX512F-NEXT: vbroadcastss %xmm4, %ymm4 ; AVX512F-NEXT: vmulps %ymm4, %ymm11, %ymm4 ; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 ; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vbroadcastss 
%xmm12, %ymm12 ; AVX512F-NEXT: vmulps %ymm1, %ymm12, %ymm12 ; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 ; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm15[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 ; AVX512F-NEXT: vmulps %ymm12, %ymm10, %ymm12 ; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 ; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm6 @@ -2786,15 +2786,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm0, %ymm15, %ymm0 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm11 ; AVX512F-NEXT: vaddps %ymm0, %ymm11, %ymm0 ; AVX512F-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11 ; AVX512F-NEXT: vmulps %ymm1, %ymm11, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm13[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512F-NEXT: vbroadcastss %xmm1, %ymm1 ; AVX512F-NEXT: vmulps %ymm1, %ymm10, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1 @@ -2860,15 +2860,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512VL-NEXT: vmulps %ymm0, %ymm14, %ymm14 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm15, %ymm11, %ymm15 ; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm1, %ymm15, %ymm15 ; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm15, %ymm10, %ymm15 ; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm4 @@ -2922,15 +2922,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm0, %ymm15, %ymm15 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm14[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 +; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm11, %ymm16 ; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm14[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 +; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm1, %ymm16 ; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm14[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 +; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm10, %ymm16 ; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 ; AVX512VL-NEXT: vextractf32x4 $3, %zmm5, %xmm5 @@ -2984,15 +2984,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm0, %ymm16 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm17 = xmm15[1,1,3,3] -; AVX512VL-NEXT: 
vbroadcastsd %xmm17, %ymm17 +; AVX512VL-NEXT: vbroadcastss %xmm17, %ymm17 ; AVX512VL-NEXT: vmulps %ymm17, %ymm11, %ymm17 ; AVX512VL-NEXT: vaddps %ymm17, %ymm16, %ymm16 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm17 = xmm15[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 +; AVX512VL-NEXT: vbroadcastss %xmm17, %ymm17 ; AVX512VL-NEXT: vmulps %ymm17, %ymm1, %ymm17 ; AVX512VL-NEXT: vaddps %ymm17, %ymm16, %ymm16 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm17 = xmm15[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 +; AVX512VL-NEXT: vbroadcastss %xmm17, %ymm17 ; AVX512VL-NEXT: vmulps %ymm17, %ymm10, %ymm17 ; AVX512VL-NEXT: vaddps %ymm17, %ymm16, %ymm16 ; AVX512VL-NEXT: vextractf32x4 $3, %zmm6, %xmm6 @@ -3046,15 +3046,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm17, %ymm17 ; AVX512VL-NEXT: vmulps %ymm17, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm17 = xmm16[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 +; AVX512VL-NEXT: vbroadcastss %xmm17, %ymm17 ; AVX512VL-NEXT: vmulps %ymm17, %ymm11, %ymm11 ; AVX512VL-NEXT: vaddps %ymm0, %ymm11, %ymm0 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm16[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512VL-NEXT: vbroadcastss %xmm11, %ymm11 ; AVX512VL-NEXT: vmulps %ymm1, %ymm11, %ymm1 ; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm16[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512VL-NEXT: vbroadcastss %xmm1, %ymm1 ; AVX512VL-NEXT: vmulps %ymm1, %ymm10, %ymm1 ; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vextractf32x4 $3, %zmm7, %xmm1 diff --git a/llvm/test/CodeGen/X86/not-of-dec.ll b/llvm/test/CodeGen/X86/not-of-dec.ll index 9790649503123..29461619ac805 100644 --- a/llvm/test/CodeGen/X86/not-of-dec.ll +++ b/llvm/test/CodeGen/X86/not-of-dec.ll @@ -57,18 +57,17 @@ define i32 @t2_extrause(i32 %alignment, ptr %mask_storage) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal -1(%eax), %edx -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: negl %eax +; X86-NEXT: decl %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: notl %eax ; X86-NEXT: retl ; ; X64-LABEL: t2_extrause: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: leal -1(%rax), %ecx -; X64-NEXT: movl %ecx, (%rsi) -; X64-NEXT: negl %eax -; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal -1(%rdi), %eax +; X64-NEXT: movl %eax, (%rsi) +; X64-NEXT: notl %eax ; X64-NEXT: retq %mask = add i32 %alignment, -1 store i32 %mask, ptr %mask_storage diff --git a/llvm/test/CodeGen/X86/pr51615.ll b/llvm/test/CodeGen/X86/pr51615.ll index a062aa138a1e5..b8dd3c196e22d 100644 --- a/llvm/test/CodeGen/X86/pr51615.ll +++ b/llvm/test/CodeGen/X86/pr51615.ll @@ -12,9 +12,9 @@ define void @volatile_load_2_elts() { ; AVX: # %bb.0: ; AVX-NEXT: vmovaps g0(%rip), %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX-NEXT: vmovaps %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll 
b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll index 74a691e520943..d0ea195671c8e 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll @@ -907,14 +907,14 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovsh 8(%ebp), %xmm3 +; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm2, %xmm3 ; X86-NEXT: seta %al ; X86-NEXT: kmovd %eax, %k0 ; X86-NEXT: kshiftlb $7, %k0, %k0 ; X86-NEXT: kshiftrb $7, %k0, %k0 ; X86-NEXT: vpsrld $16, %xmm2, %xmm3 -; X86-NEXT: vmovsh 10(%ebp), %xmm4 +; X86-NEXT: vmovsh {{.*#+}} xmm4 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm3, %xmm4 ; X86-NEXT: seta %al ; X86-NEXT: kmovd %eax, %k1 @@ -925,7 +925,7 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kandb %k1, %k0, %k0 ; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X86-NEXT: vmovsh 12(%ebp), %xmm4 +; X86-NEXT: vmovsh {{.*#+}} xmm4 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm3, %xmm4 ; X86-NEXT: seta %al ; X86-NEXT: kmovd %eax, %k1 @@ -936,7 +936,7 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kandb %k1, %k0, %k0 ; X86-NEXT: vpsrlq $48, %xmm2, %xmm2 -; X86-NEXT: vmovsh 14(%ebp), %xmm3 +; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm2, %xmm3 ; X86-NEXT: seta %al ; X86-NEXT: kmovd %eax, %k1 diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll index 0ec33ad800912..cf6edbc628503 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll @@ -35,25 +35,25 @@ define <8 x i32> @test_v8f64_oeq_q(<8 x i32> %a, <8 x i32> %b, <8 x double> %f1, } define <16 x i32> @test_v16f64_ogt(<16 x i32> %a, <16 x i32> %b, <16 x double> %f1, <16 x double> %f2) #0 { -; SKX-LABEL: test_v16f64_ogt +; SKX-LABEL: test_v16f64_ogt: ; SKX: # %bb.0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: andq $-32, %rsp -; SKX-NEXT: subq $32, %rsp -; SKX-NEXT: vcmpgtpd 80(%rbp), %ymm6, %k0 -; SKX-NEXT: vcmpgtpd 112(%rbp), %ymm7, %k1 -; SKX-NEXT: kshiftlb $4, %k1, %k1 -; SKX-NEXT: korb %k1, %k0, %k1 -; SKX-NEXT: vcmpgtpd 16(%rbp), %ymm4, %k0 -; SKX-NEXT: vcmpgtpd 48(%rbp), %ymm5, %k2 -; SKX-NEXT: kshiftlb $4, %k2, %k2 -; SKX-NEXT: korb %k2, %k0, %k2 -; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k2} -; SKX-NEXT: vpblendmd %ymm1, %ymm3, %ymm1 {%k1} -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp -; SKX-NEXT: retq +; SKX-NEXT: pushq %rbp +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $32, %rsp +; SKX-NEXT: vcmpgtpd 80(%rbp), %ymm6, %k0 +; SKX-NEXT: vcmpgtpd 112(%rbp), %ymm7, %k1 +; SKX-NEXT: kshiftlb $4, %k1, %k1 +; SKX-NEXT: korb %k1, %k0, %k1 +; SKX-NEXT: vcmpgtpd 16(%rbp), %ymm4, %k0 +; SKX-NEXT: vcmpgtpd 48(%rbp), %ymm5, %k2 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: korb %k2, %k0, %k2 +; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k2} +; SKX-NEXT: vpblendmd %ymm1, %ymm3, %ymm1 {%k1} +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: retq %cond = tail call <16 x i1> @llvm.experimental.constrained.fcmps.v16f64( <16 x double> %f1, <16 x double> %f2, metadata !"ogt", metadata 
!"fpexcept.maytrap") %res = select <16 x i1> %cond, <16 x i32> %a, <16 x i32> %b diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll index 0981c45e9d803..48a0b27a207f3 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -2338,7 +2338,7 @@ define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 { ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -2563,7 +2563,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 { ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -2695,7 +2695,7 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 { ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $1, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -2941,7 +2941,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 { ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $1, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -3228,7 +3228,7 @@ define <4 x i1> @strict_vector_fptosi_v4f32_to_v4i1(<4 x float> %a) #0 { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -3282,7 +3282,7 @@ define <4 x i1> @strict_vector_fptoui_v4f32_to_v4i1(<4 x float> %a) #0 { ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll index cba3b09445148..179e8ad69672b 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1281,7 +1281,7 @@ define <4 x i1> @strict_vector_fptosi_v4f64_to_v4i1(<4 x double> %a) #0 { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -1328,7 +1328,7 @@ define <4 x i1> @strict_vector_fptoui_v4f64_to_v4i1(<4 x double> %a) #0 { ; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} @@ -1601,7 +1601,7 @@ define <8 x i1> @strict_vector_fptosi_v8f32_to_v8i1(<8 x float> %a) #0 { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -1654,7 +1654,7 @@ define <8 x i1> @strict_vector_fptoui_v8f32_to_v8i1(<8 x float> %a) #0 { ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 ; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index ff00779d90e5c..ce5db5b246775 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -835,7 +835,7 @@ define <16 x i1> @strict_vector_fptosi_v16f32_to_v16i1(<16 x float> %a) #0 { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: ret{{[l|q]}} @@ -859,7 +859,7 @@ define <16 x i1> @strict_vector_fptoui_v16f32_to_v16i1(<16 x float> %a) #0 { ; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 1105909699d4f..bdbe3c09e5782 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3122,48 +3122,26 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; F16C-NEXT: addq $40, %rsp ; F16C-NEXT: retq ; -; AVX512F-LABEL: cvt_2f64_to_2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $104, %rsp -; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT: callq __truncdfhf2@PLT -; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2@PLT -; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX512F-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512F-NEXT: # xmm0 = mem[1,0] -; AVX512F-NEXT: callq __truncdfhf2@PLT -; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [16,0] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: addq $104, %rsp -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512-FASTLANE-LABEL: cvt_2f64_to_2i16: -; AVX512-FASTLANE: # %bb.0: -; AVX512-FASTLANE-NEXT: subq $40, %rsp -; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT -; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0] -; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT -; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT -; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-FASTLANE-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,0] -; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload -; AVX512-FASTLANE-NEXT: addq $40, %rsp -; AVX512-FASTLANE-NEXT: retq +; AVX512-LABEL: cvt_2f64_to_2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> ret <2 x i16> %2 @@ -3292,8 +3270,8 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -3424,8 +3402,8 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -4127,9 +4105,9 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] -; AVX512-NEXT: vmovaps %xmm0, (%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) ; AVX512-NEXT: addq $64, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index ae3e5445bf266..d83a61e18d1ab 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -201,8 +201,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 @@ -260,8 +260,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 @@ -319,8 +319,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 @@ -378,8 +378,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, 
%xmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] ; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index a79b109feec72..62f59e918f00c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2403,7 +2403,7 @@ define <4 x float> @shuffle_mem_pmovzx_v4f32(ptr %p0, ptr %p1) { ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; AVX1-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 2bb570521a304..a040d08728ccb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6039,8 +6039,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; ; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] @@ -6057,8 +6057,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,2,3,2,3,8,9,10,11,14,15,u,u,18,19,18,19,18,19,18,19,24,25,26,27,30,31,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -6081,8 +6081,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; ; XOPAVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] ; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] diff --git 
a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 00af58544e25c..e4eeaeb3e1a6d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -403,13 +403,12 @@ define void @PR39483() { ; ; X86-AVX512-LABEL: PR39483: ; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vmovups 0, %zmm0 -; X86-AVX512-NEXT: vmovups 64, %ymm1 -; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] -; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 -; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 -; X86-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; X86-AVX512-NEXT: vmovups 64, %ymm0 +; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7] +; X86-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0 +; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; X86-AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-AVX512-NEXT: vmovups %ymm0, (%eax) ; ; X64-AVX1-LABEL: PR39483: @@ -444,13 +443,12 @@ define void @PR39483() { ; ; X64-AVX512-LABEL: PR39483: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovups 0, %zmm0 -; X64-AVX512-NEXT: vmovups 64, %ymm1 -; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] -; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 -; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 -; X64-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; X64-AVX512-NEXT: vmovups 64, %ymm0 +; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7] +; X64-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0 +; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vmovups %ymm0, (%rax) entry: %wide.vec = load <24 x float>, ptr null, align 4 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index d07b7b574eba7..17315c436188a 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -377,14 +377,14 @@ define void @vselect_concat_splat() { ; AVX512-NEXT: vmovaps %ymm2, %ymm3 ; AVX512-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3 ; AVX512-NEXT: vmovups 32, %xmm4 -; AVX512-NEXT: vmovups 0, %ymm5 -; AVX512-NEXT: vxorps %xmm6, %xmm6, %xmm6 -; AVX512-NEXT: vcmpneqps %xmm6, %xmm3, %k0 +; AVX512-NEXT: vxorps %xmm5, %xmm5, %xmm5 +; AVX512-NEXT: vcmpneqps %xmm5, %xmm3, %k0 ; AVX512-NEXT: kshiftlw $4, %k0, %k1 ; AVX512-NEXT: korw %k1, %k0, %k1 -; AVX512-NEXT: vpermt2ps %ymm4, %ymm2, %ymm5 ; AVX512-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 -; AVX512-NEXT: vmovaps %ymm5, %ymm0 {%k1} +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,11,14,1,9,12,15,2] +; AVX512-NEXT: vpermi2ps 0, %ymm4, %ymm1 +; AVX512-NEXT: vmovaps %ymm1, %ymm0 {%k1} ; AVX512-NEXT: vmovups %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index ce092f9d343fc..c9b10d9cc8668 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: 
vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -2739,7 +2737,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero @@ -2758,7 +2756,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero @@ -2774,10 +2772,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512BW-NEXT: 
movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -3033,7 +3030,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero @@ -3052,7 +3049,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero @@ -3068,10 +3065,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -3606,35 +3602,37 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, 
%ymm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3949,10 +3947,9 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3968,7 +3965,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero @@ -3985,7 +3982,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero @@ -3998,14 +3995,14 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -4017,31 +4014,32 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -4053,18 +4051,19 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index acedcf4263906..5ba2257e2b49e 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -910,10 +910,8 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1010,8 +1008,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -1022,8 +1019,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX512F-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -1032,8 +1029,8 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX512DQ-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -1041,12 +1038,11 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; ; AVX512BW-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1108,8 +1104,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -1120,8 +1115,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX512F-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512F-NEXT: vpaddb 
(%rsi), %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -1130,8 +1125,8 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX512DQ-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -1139,12 +1134,11 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; ; AVX512BW-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1307,11 +1301,8 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper @@ -1397,8 +1388,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1407,7 +1397,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512F-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -1416,7 +1406,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -1675,10 +1665,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512F-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1694,10 +1683,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512DQ-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -1713,10 +1701,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7] -; AVX512BW-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15] +; AVX512BW-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -2173,7 +2160,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | 
ymm1) ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero @@ -2190,7 +2177,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero @@ -2205,10 +2192,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -2426,7 +2412,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero @@ -2443,7 +2429,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = mem & (ymm2 | ymm1) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero @@ -2458,10 +2444,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -2911,10 
+2896,8 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2955,11 +2938,10 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,32,59,32,61,32,63,32,9,32,11,32,13,32,15,32,17,32,19,32,21,32,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3110,11 +3092,10 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,32,60,61,32,63,8,32,10,11,32,13,14,32,16,17,32,19,20,32,22,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3182,11 +3163,9 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 @@ -3200,15 +3179,14 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3216,32 +3194,29 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -3250,7 +3225,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -3262,18 +3237,16 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -3282,7 +3255,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -3294,11 +3267,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [32,57,58,59,32,61,62,63,32,9,10,11,32,13,14,15,32,17,18,19,32,21,22,23,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3808,13 +3780,12 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper @@ -3822,13 +3793,12 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,0,0,0,0] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -3836,11 +3806,10 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512BW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,16,31,16,5,16,7,16,9,16,11,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3962,13 +3931,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper @@ -3976,13 +3944,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,0,0,0,0] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -3990,11 +3957,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,16,4,5,16,7,8,16,10,11,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4044,18 +4010,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4075,13 +4040,12 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper @@ -4089,13 +4053,12 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,0,0,0,0] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -4103,11 +4066,10 
@@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [16,29,30,31,16,5,6,7,16,9,10,11,0,0,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,0,0,0,0] -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4263,17 +4225,16 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -4293,13 +4254,12 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,8,3,8,5,0,0] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper @@ -4307,13 +4267,12 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,8,3,8,5,0,0] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -4321,11 +4280,10 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; 
AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,8,3,8,5,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4399,11 +4357,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,2,8,0,0,0,0] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -4412,11 +4369,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,2,8,0,0,0,0] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,0,0,0,0] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -4425,11 +4381,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [8,15,2,8,4,5,0,0] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,10,0,12,13,0,0] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/XCore/2010-02-25-LSR-Crash.ll b/llvm/test/CodeGen/XCore/2010-02-25-LSR-Crash.ll index e849cb34d0f43..c9fa0b5775b4e 100644 --- a/llvm/test/CodeGen/XCore/2010-02-25-LSR-Crash.ll +++ b/llvm/test/CodeGen/XCore/2010-02-25-LSR-Crash.ll @@ -7,9 +7,9 @@ target triple = "xcore-xmos-elf" %struct.object = type { ptr, ptr, ptr, %union.anon, %0, ptr } %union.anon = type { ptr } -define ptr @search_object(ptr %ob, ptr %pc) { +define ptr @search_object(ptr %ob, ptr %pc, i1 %arg) { entry: - br i1 undef, label %bb3.i15.i.i, label %bb2 + br i1 %arg, label %bb3.i15.i.i, label %bb2 bb3.i15.i.i: ; preds = %bb3.i15.i.i, %entry %indvar.i.i.i = phi i32 [ %indvar.next.i.i.i, %bb3.i15.i.i ], [ 0, %entry ] ; [#uses=2] diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll index 0ef94235fd515..4dbf3bfedffc7 
100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll @@ -2,8 +2,12 @@ ; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE ; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 2>&1 | FileCheck %s --check-prefix=ALL ; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 2>&1 | FileCheck %s --check-prefix=NONE +; +; Skip check if either or both of the skip conditions (-hwasan-random-rate=0.0 or -hwasan-percentile-cutoff-hot=990000) is met. +; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=ALL ; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE ; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=NONE +; RUN: opt < %s -passes='require,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE ; ALL: remark: :0:0: Sanitized: F=sanitize ; ALL: @sanitized diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-cvt.ll index 016e0114c83ff..2518443dc5910 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-cvt.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-cvt.ll @@ -19,13 +19,9 @@ define i32 @fcvtas_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A) @@ -38,13 +34,9 @@ define i64 @fcvtas_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A) 
@@ -57,13 +49,9 @@ define i32 @fcvtas_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A) @@ -76,13 +64,9 @@ define i64 @fcvtas_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A) @@ -100,13 +84,9 @@ define i32 @fcvtau_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A) @@ -119,13 +99,9 @@ define i64 @fcvtau_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A) @@ -138,13 +114,9 @@ define i32 @fcvtau_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; 
CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A) @@ -157,13 +129,9 @@ define i64 @fcvtau_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A) @@ -181,13 +149,9 @@ define i32 @fcvtms_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A) @@ -200,13 +164,9 @@ define i64 @fcvtms_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A) @@ -219,13 +179,9 @@ define i32 @fcvtms_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A) @@ -238,13 +194,9 @@ define i64 @fcvtms_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A) @@ -262,13 +214,9 @@ define i32 @fcvtmu_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A) @@ -281,13 +229,9 @@ define i64 @fcvtmu_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A) @@ -300,13 +244,9 @@ define i32 @fcvtmu_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A) @@ -319,13 +259,9 @@ define i64 @fcvtmu_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call 
void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A) @@ -343,13 +279,9 @@ define i32 @fcvtns_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A) @@ -362,13 +294,9 @@ define i64 @fcvtns_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A) @@ -381,13 +309,9 @@ define i32 @fcvtns_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A) @@ -400,13 +324,9 @@ define i64 @fcvtns_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 
[[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A) @@ -424,13 +344,9 @@ define i32 @fcvtnu_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A) @@ -443,13 +359,9 @@ define i64 @fcvtnu_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A) @@ -462,13 +374,9 @@ define i32 @fcvtnu_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A) @@ -481,13 +389,9 @@ define i64 @fcvtnu_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double 
%A) @@ -505,13 +409,9 @@ define i32 @fcvtps_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A) @@ -524,13 +424,9 @@ define i64 @fcvtps_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A) @@ -543,13 +439,9 @@ define i32 @fcvtps_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A) @@ -562,13 +454,9 @@ define i64 @fcvtps_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A) @@ -586,13 +474,9 @@ define i32 @fcvtpu_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof 
[[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A) @@ -605,13 +489,9 @@ define i64 @fcvtpu_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A) @@ -624,13 +504,9 @@ define i32 @fcvtpu_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A) @@ -643,13 +519,9 @@ define i64 @fcvtpu_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A) @@ -667,13 +539,9 @@ define i32 @fcvtzs_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 
8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A) @@ -686,13 +554,9 @@ define i64 @fcvtzs_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A) @@ -705,13 +569,9 @@ define i32 @fcvtzs_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A) @@ -724,13 +584,9 @@ define i64 @fcvtzs_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A) @@ -748,13 +604,9 @@ define i32 @fcvtzu_1w1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A) @@ -767,13 +619,9 @@ define i64 @fcvtzu_1x1s(float %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call 
void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A) @@ -786,13 +634,9 @@ define i32 @fcvtzu_1w1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double [[A]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %tmpvar3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A) @@ -805,13 +649,9 @@ define i64 @fcvtzu_1x1d(double %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[_MSCMP]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]]) -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %tmpvar3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A) @@ -824,6 +664,3 @@ declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone attributes #0 = { sanitize_memory } -;. -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} -;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll index f9b223dc420b9..ad0856d38c1e9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll @@ -1601,7 +1601,7 @@ define <4 x i16> @saddlp4h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1611,15 +1611,12 @@ define <4 x i16> @saddlp4h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[TMPVAR1]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmpvar1 = load <8 x i8>, ptr %A @@ -1633,7 +1630,7 @@ define <2 x i32> @saddlp2s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1643,15 +1640,12 @@ define <2 x i32> @saddlp2s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i16> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[TMPVAR1]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmpvar1 = load <4 x i16>, ptr %A @@ -1665,7 +1659,7 @@ define <1 x i64> @saddlp1d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1675,15 +1669,12 @@ define <1 x i64> @saddlp1d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> poison, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <1 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <1 x i32> [[TMP9]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[TMPVAR1]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] ; %tmpvar1 = load <2 x i32>, ptr %A @@ -1697,7 +1688,7 @@ define <8 x i16> @saddlp8h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1707,15 +1698,12 @@ define <8 x i16> @saddlp8h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <8 x i8> [[TMP9]] to <8 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> 
@llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[TMPVAR1]]) -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] ; %tmpvar1 = load <16 x i8>, ptr %A @@ -1729,7 +1717,7 @@ define <4 x i32> @saddlp4s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1739,15 +1727,12 @@ define <4 x i32> @saddlp4s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[TMPVAR1]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %tmpvar1 = load <8 x i16>, ptr %A @@ -1761,7 +1746,7 @@ define <2 x i64> @saddlp2d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1771,15 +1756,12 @@ define <2 x i64> @saddlp2d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i32> [[TMP9]] to <2 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[TMPVAR1]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %tmpvar1 = load <4 x i32>, ptr %A @@ -1801,7 +1783,7 @@ define <4 x i16> @uaddlp4h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1811,15 +1793,12 @@ define <4 x i16> @uaddlp4h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[TMPVAR1]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmpvar1 = load <8 x i8>, ptr %A @@ -1833,7 +1812,7 @@ define <2 x i32> @uaddlp2s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1843,15 +1822,12 @@ define <2 x i32> @uaddlp2s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i16> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> 
[[TMPVAR1]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmpvar1 = load <4 x i16>, ptr %A @@ -1865,7 +1841,7 @@ define <1 x i64> @uaddlp1d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1875,15 +1851,12 @@ define <1 x i64> @uaddlp1d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> poison, <1 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <1 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <1 x i32> [[TMP9]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[TMPVAR1]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] ; %tmpvar1 = load <2 x i32>, ptr %A @@ -1897,7 +1870,7 @@ define <8 x i16> @uaddlp8h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1907,15 +1880,12 @@ define <8 x i16> @uaddlp8h(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i8> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <8 x i8> [[TMP9]] to <8 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[TMPVAR1]]) -; 
CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i16> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] ; %tmpvar1 = load <16 x i8>, ptr %A @@ -1929,7 +1899,7 @@ define <4 x i32> @uaddlp4s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1939,15 +1909,12 @@ define <4 x i32> @uaddlp4s(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i16> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i16> [[TMP9]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[TMPVAR1]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %tmpvar1 = load <8 x i16>, ptr %A @@ -1961,7 +1928,7 @@ define <2 x i64> @uaddlp2d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] ; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1971,15 +1938,12 @@ define <2 x i64> @uaddlp2d(ptr %A) nounwind #0 { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i32> [[TMP9]] to <2 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[TMPVAR1]]) -; CHECK-NEXT: store <2 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP10]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %tmpvar1 = load <4 x i32>, ptr %A @@ -2012,26 +1976,23 @@ define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP15]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <4 x i16> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMPVAR5]] @@ -2060,26 +2021,23 @@ define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i16> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP15]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <2 x i32> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR5]] @@ -2108,26 +2066,23 @@ define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i8> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[TMP10]] to <8 x i16> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <8 x i16> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMPVAR5]] @@ -2156,26 +2111,23 @@ define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x 
i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <4 x i32> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR5]] @@ -2204,26 +2156,23 @@ define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i32> [[TMP10]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <2 x i64> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR5]] @@ -2252,26 +2201,23 @@ define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; 
CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i8> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP15]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <4 x i16> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMPVAR5]] @@ -2300,26 +2246,23 @@ define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i16> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP15]], 
align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <2 x i32> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR5]] @@ -2348,26 +2291,23 @@ define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i8> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[TMP10]] to <8 x i16> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <8 x i16> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMPVAR5]] @@ -2396,26 +2336,23 @@ define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof 
[[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <4 x i32> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR5]] @@ -2444,26 +2381,23 @@ define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 10: +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i32> [[TMP10]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[TMPVAR1]]) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP12:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 12: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable -; CHECK: 12: +; CHECK: 13: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 193514046488576 ; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[_MSLD1]] +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP11]], [[_MSLD1]] ; CHECK-NEXT: [[TMPVAR5:%.*]] = add <2 x i64> [[TMPVAR3]], [[TMP4]] ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR5]] @@ -2482,7 +2416,7 @@ define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2503,7 +2437,9 @@ 
define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[_MSLD1]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[_MSLD1]], <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[TMPVAR1]], <8 x i8> [[TMPVAR2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -2521,7 +2457,7 @@ define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2542,7 +2478,9 @@ define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> [[_MSLD1]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[_MSLD]], <16 x i8> [[_MSLD1]], <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[TMPVAR1]], <16 x i8> [[TMPVAR2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2560,7 +2498,7 @@ define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2581,7 +2519,9 @@ define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[_MSLD1]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[_MSLD1]], <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[TMPVAR1]], <4 x i16> [[TMPVAR2]]) ; CHECK-NEXT: 
store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -2599,7 +2539,7 @@ define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2620,7 +2560,9 @@ define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> [[_MSLD1]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> [[_MSLD1]], <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[TMPVAR1]], <8 x i16> [[TMPVAR2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2638,7 +2580,7 @@ define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2659,7 +2601,9 @@ define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[TMPVAR1]], <2 x i32> [[TMPVAR2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -2677,7 +2621,7 @@ define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2698,7 +2642,9 @@ 
define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD1]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD1]], <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[TMPVAR1]], <4 x i32> [[TMPVAR2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2716,7 +2662,7 @@ define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2737,7 +2683,9 @@ define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[_MSLD]], <2 x i64> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[_MSLD]], <2 x i64> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[TMPVAR1]], <2 x i64> [[TMPVAR2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2763,7 +2711,7 @@ define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2784,7 +2732,9 @@ define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[TMPVAR1]], <2 x float> 
[[TMPVAR2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x float> [[TMP3]] @@ -2802,7 +2752,7 @@ define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2823,7 +2773,9 @@ define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD1]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD1]], <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[TMPVAR1]], <4 x float> [[TMPVAR2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[TMP3]] @@ -2841,7 +2793,7 @@ define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -2862,7 +2814,9 @@ define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[_MSLD]], <2 x i64> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[_MSLD]], <2 x i64> [[_MSLD1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[TMPVAR1]], <2 x double> [[TMPVAR2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[TMP3]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll index 2f49fde82c1af..48fe06728b71c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll @@ -21,15 +21,10 @@ define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %A) @@ -41,15 +36,10 @@ define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %A) @@ -61,15 +51,10 @@ define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %A) @@ -81,15 +66,10 @@ define <1 x i64> @fcvtas_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 
3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %A) @@ -106,15 +86,10 @@ define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %A) @@ -126,15 +101,10 @@ define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %A) @@ -146,15 +116,10 @@ define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: 
[[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %A) @@ -166,15 +131,10 @@ define <1 x i64> @fcvtau_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %A) @@ -191,15 +151,10 @@ define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %A) @@ -211,15 +166,10 @@ define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> 
[[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %A) @@ -231,15 +181,10 @@ define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %A) @@ -251,15 +196,10 @@ define <1 x i64> @fcvtms_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %A) @@ -276,15 +216,10 @@ define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A) @@ -296,15 +231,10 @@ define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, 
ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A) @@ -316,15 +246,10 @@ define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A) @@ -336,15 +261,10 @@ define <1 x i64> @fcvtmu_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %A) @@ -361,15 +281,10 @@ define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], 
!prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %A) @@ -381,15 +296,10 @@ define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %A) @@ -401,15 +311,10 @@ define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %A) @@ -421,15 +326,10 @@ define <1 x i64> @fcvtps_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x 
i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %A) @@ -446,15 +346,10 @@ define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A) @@ -466,15 +361,10 @@ define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A) @@ -486,15 +376,10 @@ define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A) @@ -506,15 +391,10 @@ define <1 x i64> @fcvtpu_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %A) @@ -531,15 +411,10 @@ define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %A) @@ -551,15 +426,10 @@ define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %A) @@ -571,15 +441,10 @@ define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: 
[[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %A) @@ -591,15 +456,10 @@ define <1 x i64> @fcvtns_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %A) @@ -616,15 +476,10 @@ define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A) @@ -636,15 +491,10 @@ define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label 
[[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A) @@ -656,15 +506,10 @@ define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A) @@ -676,15 +521,10 @@ define <1 x i64> @fcvtnu_1d(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %A) @@ -753,15 +593,10 @@ define <2 x i32> @fcvtzs_2s_intrinsic(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: 
[[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %A) @@ -773,15 +608,10 @@ define <4 x i32> @fcvtzs_4s_intrinsic(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %A) @@ -793,15 +623,10 @@ define <2 x i64> @fcvtzs_2d_intrinsic(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %A) @@ -813,15 +638,10 @@ define <1 x i64> @fcvtzs_1d_intrinsic(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %A) @@ -890,15 +710,10 @@ define <2 x i32> @fcvtzu_2s_intrinsic(<2 x float> %A) nounwind #0 { ; CHECK-SAME: <2 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMPVAR3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %A) @@ -910,15 +725,10 @@ define <4 x i32> @fcvtzu_4s_intrinsic(<4 x float> %A) nounwind #0 { ; CHECK-SAME: <4 x float> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[A]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMPVAR3]] ; %tmpvar3 = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %A) @@ -930,15 +740,10 @@ define <2 x i64> @fcvtzu_2d_intrinsic(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMPVAR3]] ; %tmpvar3 = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %A) @@ -950,15 +755,10 @@ define <1 x i64> 
@fcvtzu_1d_intrinsic(<1 x double> %A) nounwind #0 { ; CHECK-SAME: <1 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <1 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]]) -; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMPVAR3]] ; %tmpvar3 = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %A) @@ -1276,15 +1076,10 @@ define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind #0 { ; CHECK-SAME: <2 x double> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x float> [[TMPVAR3]] ; %tmpvar3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) @@ -1297,15 +1092,10 @@ define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 5: +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x float> [[RET]], <2 x float> [[TMPVAR3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] @@ -1324,9 +1114,9 @@ define <2 x 
i32> @fcvtzsc_2s(<2 x float> %A) nounwind #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: 4: ; CHECK-NEXT: [[TMPVAR3:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[A]], i32 1) diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll index 5ecedf61871d8..8e9110fa836c7 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll @@ -2,11 +2,6 @@ ; RUN: opt < %s -passes=msan -S | FileCheck %s ; ; Forked from llvm/test/CodeGen/AArch64/arm64-vmovn.ll -; -; Not correctly handled (by visitInstruction): -; - llvm.aarch64.neon.sqxtn -; - llvm.aarch64.neon.sqxtun -; - llvm.aarch64.neon.uqxtn target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android9001" @@ -112,15 +107,10 @@ define <8 x i8> @sqxtn8b(<8 x i16> %A) nounwind #0 { ; CHECK-SAME: <8 x i16> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1:![0-9]+]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[A]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmpvar3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A) @@ -132,15 +122,10 @@ define <4 x i16> @sqxtn4h(<4 x i32> %A) nounwind #0 { ; CHECK-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[A]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmpvar3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> 
%A) @@ -152,15 +137,10 @@ define <2 x i32> @sqxtn2s(<2 x i64> %A) nounwind #0 { ; CHECK-SAME: <2 x i64> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A) @@ -173,15 +153,10 @@ define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i16> [[_MSPROP1]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x i8> [[RET]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[RES]] @@ -197,15 +172,10 @@ define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[_MSPROP1]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP5]], <8 x i32> ; CHECK-NEXT: [[RES:%.*]] = 
shufflevector <4 x i16> [[RET]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[RES]] @@ -221,15 +191,10 @@ define <4 x i32> @sqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[_MSPROP1]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i32> [[RET]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] @@ -248,15 +213,10 @@ define <8 x i8> @uqxtn8b(<8 x i16> %A) nounwind #0 { ; CHECK-SAME: <8 x i16> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[A]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmpvar3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A) @@ -268,15 +228,10 @@ define <4 x i16> @uqxtn4h(<4 x i32> %A) nounwind #0 { ; CHECK-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[A]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret 
<4 x i16> [[TMP3]] ; %tmpvar3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A) @@ -288,15 +243,10 @@ define <2 x i32> @uqxtn2s(<2 x i64> %A) nounwind #0 { ; CHECK-SAME: <2 x i64> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A) @@ -309,15 +259,10 @@ define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i16> [[_MSPROP1]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x i8> [[RET]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[RES]] @@ -333,15 +278,10 @@ define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[_MSPROP1]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = 
shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP5]], <8 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[RET]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[RES]] @@ -357,15 +297,10 @@ define <4 x i32> @uqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[_MSPROP1]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i32> [[RET]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] @@ -384,15 +319,10 @@ define <8 x i8> @sqxtun8b(<8 x i16> %A) nounwind #0 { ; CHECK-SAME: <8 x i16> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[A]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmpvar3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A) @@ -404,15 +334,10 @@ define <4 x i16> @sqxtun4h(<4 x i32> %A) nounwind #0 { ; CHECK-SAME: <4 x i32> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[A]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, 
align 8 +; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmpvar3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A) @@ -424,15 +349,10 @@ define <2 x i32> @sqxtun2s(<2 x i64> %A) nounwind #0 { ; CHECK-SAME: <2 x i64> [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] -; CHECK: [[BB3]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB4]]: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[A]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmpvar3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A) @@ -445,15 +365,10 @@ define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i16> [[_MSPROP1]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP5]], <16 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x i8> [[RET]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[RES]] @@ -469,15 +384,10 @@ define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[_MSPROP1]] to <4 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = 
shufflevector <4 x i16> [[TMP2]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP5]], <8 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[RET]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[RES]] @@ -493,15 +403,10 @@ define <4 x i32> @sqxtun2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] -; CHECK: [[BB4]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] -; CHECK-NEXT: unreachable -; CHECK: [[BB5]]: +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[_MSPROP1]] to <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[A]]) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i32> [[RET]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] @@ -516,6 +421,3 @@ declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone declare <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone attributes #0 = { sanitize_memory } -;. -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} -;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll index 8a1e3551b2741..7fa9b412b0f03 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll @@ -9405,17 +9405,19 @@ define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9438,17 +9440,19 @@ define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: 
[[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9471,17 +9475,19 @@ define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9504,17 +9510,19 @@ define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9537,17 +9545,19 @@ define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sani ; CHECK-NEXT: 
[[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9570,17 +9580,19 @@ define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9603,17 +9615,19 @@ define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP10]], [[TMP4]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK: 9: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/qshrn.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/qshrn.ll index f35178a30dc14..7e70a18ed5b59 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/qshrn.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/qshrn.ll @@ -4,7 +4,6 @@ ; Forked from llvm/test/CodeGen/AArch64/qshrn.ll ; ; Heuristically (but correctly) handled: llvm.smax, llvm.smin, llvm.umin -; Incorrectly handled (handleUnknownInstruction): llvm.aarch64.neon.sqxtn, llvm.aarch64.neon.sqxtun, llvm.aarch64.neon.uqxtn target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android9001" @@ -17,15 +16,10 @@ define <4 x i16> @NarrowAShrI32By5(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <4 x i32> [[X]], splat (i32 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = ashr <4 x i32> %x, @@ -41,15 +35,10 @@ define <4 x i16> @NarrowAShrU32By5(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <4 x i32> [[X]], splat (i32 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; 
CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = ashr <4 x i32> %x, @@ -65,15 +54,10 @@ define <4 x i16> @NarrowAShrI32By5ToU16(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <4 x i32> [[X]], splat (i32 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = ashr <4 x i32> %x, @@ -89,15 +73,10 @@ define <4 x i16> @NarrowLShrI32By5(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <4 x i32> [[X]], splat (i32 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = lshr <4 x i32> %x, @@ -113,15 +92,10 @@ define <4 x i16> @NarrowLShrU32By5(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <4 x i32> [[X]], splat (i32 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, 
ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = lshr <4 x i32> %x, @@ -137,15 +111,10 @@ define <4 x i16> @NarrowLShrI32By5ToU16(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <4 x i32> [[X]], splat (i32 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = lshr <4 x i32> %x, @@ -162,15 +131,10 @@ define <2 x i32> @NarrowAShri64By5(<2 x i64> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <2 x i64> [[X]], splat (i64 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[R:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[S]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = ashr <2 x i64> %x, @@ -186,15 +150,10 @@ define <2 x i32> @NarrowAShrU64By5(<2 x i64> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <2 x i64> [[X]], splat (i64 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[R:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[S]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = ashr <2 x i64> %x, @@ -210,15 +169,10 @@ define <2 x i32> @NarrowAShri64By5ToU32(<2 x i64> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: 
[[S:%.*]] = ashr <2 x i64> [[X]], splat (i64 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[R:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[S]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = ashr <2 x i64> %x, @@ -234,15 +188,10 @@ define <2 x i32> @NarrowLShri64By5(<2 x i64> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <2 x i64> [[X]], splat (i64 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[R:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[S]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = lshr <2 x i64> %x, @@ -258,15 +207,10 @@ define <2 x i32> @NarrowLShrU64By5(<2 x i64> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <2 x i64> [[X]], splat (i64 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[R:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[S]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = lshr <2 x i64> %x, @@ -282,15 +226,10 @@ define <2 x i32> @NarrowLShri64By5ToU32(<2 x i64> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <2 x i64> [[X]], splat (i64 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[_MSPROP]] to <2 x i32> ; CHECK-NEXT: [[R:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[S]]) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = lshr <2 x i64> %x, @@ -307,15 +246,10 @@ define <8 x i8> @NarrowAShri16By5(<8 x i16> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <8 x i16> [[X]], splat (i16 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[R:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[S]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[R]] ; %s = ashr <8 x i16> %x, @@ -331,15 +265,10 @@ define <8 x i8> @NarrowAShrU16By5(<8 x i16> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <8 x i16> [[X]], splat (i16 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[R:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[S]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[R]] ; %s = ashr <8 x i16> %x, @@ -355,15 +284,10 @@ define <8 x i8> @NarrowAShri16By5ToU8(<8 x i16> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <8 x i16> [[X]], splat (i16 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[R:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[S]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[R]] ; %s = 
ashr <8 x i16> %x, @@ -379,15 +303,10 @@ define <8 x i8> @NarrowLShri16By5(<8 x i16> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <8 x i16> [[X]], splat (i16 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[R:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[S]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[R]] ; %s = lshr <8 x i16> %x, @@ -403,15 +322,10 @@ define <8 x i8> @NarrowLShrU16By5(<8 x i16> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <8 x i16> [[X]], splat (i16 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[R:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[S]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[R]] ; %s = lshr <8 x i16> %x, @@ -427,15 +341,10 @@ define <8 x i8> @NarrowLShri16By5ToU8(<8 x i16> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 5) ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <8 x i16> [[X]], splat (i16 5) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[R:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[S]]) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[R]] ; %s = lshr <8 x i16> %x, @@ -455,15 +364,10 @@ define <4 x i16> @NarrowAShrI32By31(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 16) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <4 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = ashr <4 x i32> %x, @@ -479,15 +383,10 @@ define <4 x i16> @NarrowAShrI32By31ToU16(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 16) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = ashr <4 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = ashr <4 x i32> %x, @@ -503,15 +402,10 @@ define <4 x i16> @NarrowLShrU32By31(<4 x i32> %x) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 16) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[S:%.*]] = lshr <4 x i32> [[X]], splat (i32 16) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[S]]) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[R]] ; %s = lshr <4 x i32> %x, @@ -748,6 +642,3 @@ entry: } attributes #0 = { sanitize_memory } -;. -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} -;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/scmp.ll b/llvm/test/Instrumentation/MemorySanitizer/scmp.ll index 89c5b283b2510..5c94c216106a2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/scmp.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/scmp.ll @@ -1,14 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=msan -S | FileCheck %s ; -; llvm.scmp is correctly handled heuristically when each parameter is the same -; type as the return type e.g., -; call i8 @llvm.scmp.i8.i8(i8 %x, i8 %y) -; but handled incorrectly by visitInstruction when the return type is different -; e.g., -; call i8 @llvm.scmp.i8.i62(i62 %x, i62 %y) -; call <4 x i8> @llvm.scmp.v4i8.v4i32(<4 x i32> %x, <4 x i32> %y) -; ; Forked from llvm/test/CodeGen/X86/scmp.ll target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -21,8 +13,9 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i8 [[_MSPROP]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[X]], i8 [[Y]]) -; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP3]] ; %1 = call i8 @llvm.scmp(i8 %x, i8 %y) @@ -35,16 +28,11 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i16 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i16 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i16 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.scmp.i8.i16(i16 [[X]], i16 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.scmp(i16 %x, i16 %y) @@ -57,16 +45,11 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
trunc i32 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.scmp.i8.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.scmp(i32 %x, i32 %y) @@ -79,16 +62,11 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.scmp.i8.i64(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.scmp(i64 %x, i64 %y) @@ -101,16 +79,11 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i128 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.scmp.i8.i128(i128 [[X]], i128 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.scmp(i128 %x, i128 %y) @@ -124,8 +97,9 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = call i32 @llvm.scmp(i32 %x, i32 %y) @@ -138,16 +112,11 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[_MSPROP1]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.scmp.i32.i64(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP5]] ; %1 = call i32 @llvm.scmp(i64 %x, i64 %y) @@ -161,8 +130,9 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.scmp.i64.i64(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %1 = call i64 @llvm.scmp(i64 %x, i64 %y) @@ -175,16 +145,11 @@ define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[_MSPROP1]] to i4 ; CHECK-NEXT: [[TMP5:%.*]] = call i4 @llvm.scmp.i4.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i4 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i4 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i4 [[TMP5]] ; %1 = call i4 @llvm.scmp(i32 %x, i32 %y) @@ -197,16 +162,11 @@ define i8 @scmp_narrow_op(i62 %x, i62 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i62, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i62, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i62 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i62 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i62 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i62 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i62 [[_MSPROP1]] to i8 ; CHECK-NEXT: 
[[TMP5:%.*]] = call i8 @llvm.scmp.i8.i62(i62 [[X]], i62 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.scmp(i62 %x, i62 %y) @@ -219,16 +179,11 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[_MSPROP1]] to i141 ; CHECK-NEXT: [[TMP5:%.*]] = call i141 @llvm.scmp.i141.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i141 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i141 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i141 [[TMP5]] ; %1 = call i141 @llvm.scmp(i32 %x, i32 %y) @@ -241,16 +196,11 @@ define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i109, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i109, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i109 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i109 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i109 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i109 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i109 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.scmp.i8.i109(i109 [[X]], i109 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.scmp(i109 %x, i109 %y) @@ -263,16 +213,11 @@ define i41 @scmp_uncommon_types(i7 %x, i7 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i7, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i7, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i7 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i7 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i7 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i7 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = zext i7 [[_MSPROP1]] to i41 ; CHECK-NEXT: [[TMP5:%.*]] = call i41 @llvm.scmp.i41.i7(i7 [[X]], i7 [[Y]]) -; CHECK-NEXT: store i41 
0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i41 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i41 [[TMP5]] ; %1 = call i41 @llvm.scmp(i7 %x, i7 %y) @@ -286,8 +231,9 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[X]], <4 x i32> [[Y]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %1 = call <4 x i32> @llvm.scmp(<4 x i32> %x, <4 x i32> %y) @@ -300,18 +246,11 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[_MSPROP1]] to <4 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i8> @llvm.scmp.v4i8.v4i32(<4 x i32> [[X]], <4 x i32> [[Y]]) -; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i8> [[TMP7]] ; %1 = call <4 x i8> @llvm.scmp(<4 x i32> %x, <4 x i32> %y) @@ -324,18 +263,11 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[TMP2]] to i32 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[_MSPROP1]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i8(<4 x i8> [[X]], <4 x i8> [[Y]]) -; CHECK-NEXT: 
store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %1 = call <4 x i32> @llvm.scmp(<4 x i8> %x, <4 x i8> %y) @@ -348,18 +280,11 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[_MSPROP1]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.scmp.v16i32.v16i8(<16 x i8> [[X]], <16 x i8> [[Y]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = call <16 x i32> @llvm.scmp(<16 x i8> %x, <16 x i8> %y) @@ -372,18 +297,11 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i64> [[TMP1]] to i1024 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i1024 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i64> [[TMP2]] to i1024 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i1024 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i64> [[_MSPROP1]] to <16 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.scmp.v16i8.v16i64(<16 x i64> [[X]], <16 x i64> [[Y]]) -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP7]] ; %1 = call <16 x i8> @llvm.scmp(<16 x i64> %x, <16 x i64> %y) @@ -396,18 +314,11 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <7 x i7>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <7 x i7>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <7 x i7> [[TMP1]] to i49 -; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i49 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <7 x i7> [[TMP2]] to i49 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i49 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <7 x i7> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <7 x i7> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <7 x i7> [[_MSPROP1]] to <7 x i117> ; CHECK-NEXT: [[TMP7:%.*]] = call <7 x i117> @llvm.scmp.v7i117.v7i7(<7 x i7> [[X]], <7 x i7> [[Y]]) -; CHECK-NEXT: store <7 x i117> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <7 x i117> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <7 x i117> [[TMP7]] ; %1 = call <7 x i117> @llvm.scmp(<7 x i7> %x, <7 x i7> %y) @@ -420,18 +331,11 @@ define <1 x i3> @scmp_scalarize(<1 x i33> %x, <1 x i33> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i33>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i33>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i33> [[TMP1]] to i33 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i33 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i33> [[TMP2]] to i33 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i33 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i33> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i33> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <1 x i33> [[_MSPROP1]] to <1 x i3> ; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i3> @llvm.scmp.v1i3.v1i33(<1 x i33> [[X]], <1 x i33> [[Y]]) -; CHECK-NEXT: store <1 x i3> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i3> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i3> [[TMP7]] ; %1 = call <1 x i3> @llvm.scmp(<1 x i33> %x, <1 x i33> %y) @@ -444,18 +348,11 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i1>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i1> [[TMP1]] to i2 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i2 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i1> [[TMP2]] to i2 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i2 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i1> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i1> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[_MSPROP1]] to <2 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i8> @llvm.scmp.v2i8.v2i1(<2 x i1> [[X]], 
<2 x i1> [[Y]]) -; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i8> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i8> [[TMP7]] ; %1 = call <2 x i8> @llvm.scmp(<2 x i1> %x, <2 x i1> %y) @@ -468,18 +365,11 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i8> [[TMP1]] to i16 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i8> [[TMP2]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[_MSPROP1]] to <2 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.scmp.v2i16.v2i8(<2 x i8> [[X]], <2 x i8> [[Y]]) -; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i16> [[TMP7]] ; %1 = call <2 x i16> @llvm.scmp(<2 x i8> %x, <2 x i8> %y) @@ -487,6 +377,3 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin } attributes #0 = { sanitize_memory } -;. -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} -;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll b/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll index 5e0a248afe748..1b70242dae2b5 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll @@ -1,14 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=msan -S | FileCheck %s ; -; llvm.ucmp is correctly handled heuristically when each parameter is the same -; type as the return type e.g., -; call i8 @llvm.ucmp.i8.i8(i8 %x, i8 %y) -; but handled incorrectly by visitInstruction when the return type is different -; e.g., -; call i8 @llvm.ucmp.i8.i62(i62 %x, i62 %y) -; call <4 x i8> @llvm.ucmp.v4i8.v4i32(<4 x i32> %x, <4 x i32> %y) - ; Forked from llvm/test/CodeGen/X86/ucmp.ll target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -21,8 +13,9 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i8 [[_MSPROP]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.ucmp.i8.i8(i8 [[X]], i8 [[Y]]) -; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP3]] ; %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) @@ -35,16 +28,11 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i16 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i16 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i16 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.ucmp.i8.i16(i16 [[X]], i16 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) @@ -57,16 +45,11 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = 
trunc i32 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.ucmp.i8.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) @@ -79,16 +62,11 @@ define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.ucmp.i8.i64(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) @@ -101,16 +79,11 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i128 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i128 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.ucmp.i8.i128(i128 [[X]], i128 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.ucmp(i128 %x, i128 %y) @@ -124,8 +97,9 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP3]] ; %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) @@ -138,16 +112,11 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[_MSPROP1]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.ucmp.i32.i64(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP5]] ; %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) @@ -161,8 +130,9 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.ucmp.i64.i64(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) @@ -175,16 +145,11 @@ define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[_MSPROP1]] to i4 ; CHECK-NEXT: [[TMP5:%.*]] = call i4 @llvm.ucmp.i4.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i4 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i4 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i4 [[TMP5]] ; %1 = call i4 @llvm.ucmp(i32 %x, i32 %y) @@ -197,16 +162,11 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i62, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i62, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i62 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i62 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i62 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i62 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i62 [[_MSPROP1]] to i8 ; CHECK-NEXT: 
[[TMP5:%.*]] = call i8 @llvm.ucmp.i8.i62(i62 [[X]], i62 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.ucmp(i62 %x, i62 %y) @@ -219,16 +179,11 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[_MSPROP1]] to i141 ; CHECK-NEXT: [[TMP5:%.*]] = call i141 @llvm.ucmp.i141.i32(i32 [[X]], i32 [[Y]]) -; CHECK-NEXT: store i141 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i141 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i141 [[TMP5]] ; %1 = call i141 @llvm.ucmp(i32 %x, i32 %y) @@ -241,16 +196,11 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i109, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i109, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i109 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i109 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i109 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i109 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i109 [[_MSPROP1]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = call i8 @llvm.ucmp.i8.i109(i109 [[X]], i109 [[Y]]) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i8 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i8 [[TMP5]] ; %1 = call i8 @llvm.ucmp(i109 %x, i109 %y) @@ -263,16 +213,11 @@ define i41 @ucmp_uncommon_types(i7 %x, i7 %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load i7, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i7, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i7 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i7 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK-NEXT: [[_MSPROP:%.*]] = or i7 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i7 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = zext i7 [[_MSPROP1]] to i41 ; CHECK-NEXT: [[TMP5:%.*]] = call i41 @llvm.ucmp.i41.i7(i7 [[X]], i7 [[Y]]) -; CHECK-NEXT: store i41 
0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i41 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i41 [[TMP5]] ; %1 = call i41 @llvm.ucmp(i7 %x, i7 %y) @@ -286,8 +231,9 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> [[X]], <4 x i32> [[Y]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %1 = call <4 x i32> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y) @@ -300,18 +246,11 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[_MSPROP1]] to <4 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i8> @llvm.ucmp.v4i8.v4i32(<4 x i32> [[X]], <4 x i32> [[Y]]) -; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i8> [[TMP7]] ; %1 = call <4 x i8> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y) @@ -324,18 +263,11 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i8> [[TMP2]] to i32 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[_MSPROP1]] to <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.ucmp.v4i32.v4i8(<4 x i8> [[X]], <4 x i8> [[Y]]) -; CHECK-NEXT: 
store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %1 = call <4 x i32> @llvm.ucmp(<4 x i8> %x, <4 x i8> %y) @@ -348,18 +280,11 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind #0 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[_MSPROP1]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.ucmp.v16i32.v16i8(<16 x i8> [[X]], <16 x i8> [[Y]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP7]] ; %1 = call <16 x i32> @llvm.ucmp(<16 x i8> %x, <16 x i8> %y) @@ -372,18 +297,11 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i32> [[_MSPROP1]] to <16 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ucmp.v16i8.v16i32(<16 x i32> [[X]], <16 x i32> [[Y]]) -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP7]] ; %1 = call <16 x i8> @llvm.ucmp(<16 x i32> %x, <16 x i32> %y) @@ -396,18 +314,11 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind # ; CHECK-NEXT: [[TMP1:%.*]] = load <17 x i71>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <17 x i71>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <17 x i71> [[TMP1]] to i1207 -; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i1207 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <17 x i71> [[TMP2]] to i1207 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i1207 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <17 x i71> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <17 x i71> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <17 x i71> [[_MSPROP1]] to <17 x i2> ; CHECK-NEXT: [[TMP7:%.*]] = call <17 x i2> @llvm.ucmp.v17i2.v17i71(<17 x i71> [[X]], <17 x i71> [[Y]]) -; CHECK-NEXT: store <17 x i2> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <17 x i2> [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <17 x i2> [[TMP7]] ; %1 = call <17 x i2> @llvm.ucmp(<17 x i71> %x, <17 x i71> %y) @@ -415,6 +326,3 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind # } attributes #0 = { sanitize_memory } -;. -; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} -;. diff --git a/llvm/test/LTO/X86/coro.ll b/llvm/test/LTO/X86/coro.ll new file mode 100644 index 0000000000000..cde398dd76d85 --- /dev/null +++ b/llvm/test/LTO/X86/coro.ll @@ -0,0 +1,21 @@ +; RUN: llvm-as %s -o %t1.bc +; RUN: llvm-lto2 run %t1.bc -o %t2.o -r=%t1.bc,test,plx -r=%t1.bc,extern_func,plx -save-temps +; RUN: llvm-dis %t2.o.0.5.precodegen.bc -o - | FileCheck %s --implicit-check-not="call void @llvm.coro" + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-fuchsia" + +declare void @extern_func() + +; CHECK: define {{.*}} void @test( +define void @test(ptr %hdl) { + call void @llvm.coro.resume(ptr %hdl) + call void @llvm.coro.destroy(ptr %hdl) + call i1 @llvm.coro.done(ptr %hdl) + ret void +} + +declare void @llvm.coro.resume(ptr) +declare void @llvm.coro.destroy(ptr) +declare i1 @llvm.coro.done(ptr) + diff --git a/llvm/test/MC/AArch64/armv8.1a-lse.s b/llvm/test/MC/AArch64/armv8.1a-lse.s index b5bbbe66c6ae2..eb9a30dea9e6b 100644 --- a/llvm/test/MC/AArch64/armv8.1a-lse.s +++ b/llvm/test/MC/AArch64/armv8.1a-lse.s @@ -7,6 +7,8 @@ // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mcpu=tsv110 -show-encoding < %s 2> %t | FileCheck %s // RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r -show-encoding < %s 2> %t | FileCheck %s +// RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a,+lse,+lsui -show-encoding < %s 2> %t | FileCheck %s // RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s .text diff --git a/llvm/test/MC/AArch64/armv9.6a-lsui.s b/llvm/test/MC/AArch64/armv9.6a-lsui.s index b48db1f9b5570..d4a5e1f980560 100644 --- a/llvm/test/MC/AArch64/armv9.6a-lsui.s +++ b/llvm/test/MC/AArch64/armv9.6a-lsui.s @@ -313,42 +313,16 @@ _func: // ERROR: instruction requires: lsui sttaddl w0, [x2] -// CHECK: ldtadd w0, wzr, [x2] // encoding: [0x5f,0x04,0x20,0x19] +// CHECK: ldtaddl w0, wzr, [x2] // encoding: [0x5f,0x04,0x60,0x19] // ERROR: instruction requires: lsui sttaddl w2, [sp] -// CHECK: ldtadd w2, wzr, [sp] // encoding: [0xff,0x07,0x22,0x19] +// CHECK: ldtaddl w2, wzr, [sp] // encoding: [0xff,0x07,0x62,0x19] // ERROR: instruction requires: lsui sttaddl x0, [x2] -// CHECK: ldtadd x0, xzr, [x2] 
// encoding: [0x5f,0x04,0x20,0x59] +// CHECK: ldtaddl x0, xzr, [x2] // encoding: [0x5f,0x04,0x60,0x59] // ERROR: instruction requires: lsui sttaddl x2, [sp] -// CHECK: ldtadd x2, xzr, [sp] // encoding: [0xff,0x07,0x22,0x59] -// ERROR: instruction requires: lsui - - sttadda w0, [x2] -// CHECK: ldtadd w0, wzr, [x2] // encoding: [0x5f,0x04,0x20,0x19] -// ERROR: instruction requires: lsui - sttadda w2, [sp] -// CHECK: ldtadd w2, wzr, [sp] // encoding: [0xff,0x07,0x22,0x19] -// ERROR: instruction requires: lsui - sttadda x0, [x2] -// CHECK: ldtadd x0, xzr, [x2] // encoding: [0x5f,0x04,0x20,0x59] -// ERROR: instruction requires: lsui - sttadda x2, [sp] -// CHECK: ldtadd x2, xzr, [sp] // encoding: [0xff,0x07,0x22,0x59] -// ERROR: instruction requires: lsui - - sttaddal w0, [x2] -// CHECK: ldtadd w0, wzr, [x2] // encoding: [0x5f,0x04,0x20,0x19] -// ERROR: instruction requires: lsui - sttaddal w2, [sp] -// CHECK: ldtadd w2, wzr, [sp] // encoding: [0xff,0x07,0x22,0x19] -// ERROR: instruction requires: lsui - sttaddal x0, [x2] -// CHECK: ldtadd x0, xzr, [x2] // encoding: [0x5f,0x04,0x20,0x59] -// ERROR: instruction requires: lsui - sttaddal x2, [sp] -// CHECK: ldtadd x2, xzr, [sp] // encoding: [0xff,0x07,0x22,0x59] +// CHECK: ldtaddl x2, xzr, [sp] // encoding: [0xff,0x07,0x62,0x59] // ERROR: instruction requires: lsui sttclr w0, [x2] @@ -362,45 +336,19 @@ _func: // ERROR: instruction requires: lsui sttclr x2, [sp] // CHECK: ldtclr x2, xzr, [sp] // encoding: [0xff,0x17,0x22,0x59] -// ERROR: instruction requires: lsui - - sttclra w0, [x2] -// CHECK: ldtclr w0, wzr, [x2] // encoding: [0x5f,0x14,0x20,0x19] -// ERROR: instruction requires: lsui - sttclra w2, [sp] -// CHECK: ldtclr w2, wzr, [sp] // encoding: [0xff,0x17,0x22,0x19] -// ERROR: instruction requires: lsui - sttclra x0, [x2] -// CHECK: ldtclr x0, xzr, [x2] // encoding: [0x5f,0x14,0x20,0x59] -// ERROR: instruction requires: lsui - sttclra x2, [sp] -// CHECK: ldtclr x2, xzr, [sp] // encoding: [0xff,0x17,0x22,0x59] // ERROR: instruction requires: lsui sttclrl w0, [x2] -// CHECK: ldtclr w0, wzr, [x2] // encoding: [0x5f,0x14,0x20,0x19] +// CHECK: ldtclrl w0, wzr, [x2] // encoding: [0x5f,0x14,0x60,0x19] // ERROR: instruction requires: lsui sttclrl w2, [sp] -// CHECK: ldtclr w2, wzr, [sp] // encoding: [0xff,0x17,0x22,0x19] +// CHECK: ldtclrl w2, wzr, [sp] // encoding: [0xff,0x17,0x62,0x19] // ERROR: instruction requires: lsui sttclrl x0, [x2] -// CHECK: ldtclr x0, xzr, [x2] // encoding: [0x5f,0x14,0x20,0x59] +// CHECK: ldtclrl x0, xzr, [x2] // encoding: [0x5f,0x14,0x60,0x59] // ERROR: instruction requires: lsui sttclrl x2, [sp] -// CHECK: ldtclr x2, xzr, [sp] // encoding: [0xff,0x17,0x22,0x59] -// ERROR: instruction requires: lsui - - sttclral w0, [x2] -// CHECK: ldtclr w0, wzr, [x2] // encoding: [0x5f,0x14,0x20,0x19] -// ERROR: instruction requires: lsui - sttclral x2, [sp] -// CHECK: ldtclr x2, xzr, [sp] // encoding: [0xff,0x17,0x22,0x59] -// ERROR: instruction requires: lsui - sttclral x0, [x2] -// CHECK: ldtclr x0, xzr, [x2] // encoding: [0x5f,0x14,0x20,0x59] -// ERROR: instruction requires: lsui - sttclral x2, [sp] -// CHECK: ldtclr x2, xzr, [sp] // encoding: [0xff,0x17,0x22,0x59] +// CHECK: ldtclrl x2, xzr, [sp] // encoding: [0xff,0x17,0x62,0x59] // ERROR: instruction requires: lsui sttset w0, [x2] @@ -414,45 +362,19 @@ _func: // ERROR: instruction requires: lsui sttset x2, [sp] // CHECK: ldtset x2, xzr, [sp] // encoding: [0xff,0x37,0x22,0x59] -// ERROR: instruction requires: lsui - - sttseta w0, [x2] -// CHECK: ldtset w0, wzr, [x2] // encoding: 
[0x5f,0x34,0x20,0x19] -// ERROR: instruction requires: lsui - sttseta w2, [sp] -// CHECK: ldtset w2, wzr, [sp] // encoding: [0xff,0x37,0x22,0x19] -// ERROR: instruction requires: lsui - sttseta x0, [x2] -// CHECK: ldtset x0, xzr, [x2] // encoding: [0x5f,0x34,0x20,0x59] -// ERROR: instruction requires: lsui - sttseta x2, [sp] -// CHECK: ldtset x2, xzr, [sp] // encoding: [0xff,0x37,0x22,0x59] // ERROR: instruction requires: lsui sttsetl w0, [x2] -// CHECK: ldtset w0, wzr, [x2] // encoding: [0x5f,0x34,0x20,0x19] +// CHECK: ldtsetl w0, wzr, [x2] // encoding: [0x5f,0x34,0x60,0x19] // ERROR: instruction requires: lsui sttsetl w2, [sp] -// CHECK: ldtset w2, wzr, [sp] // encoding: [0xff,0x37,0x22,0x19] +// CHECK: ldtsetl w2, wzr, [sp] // encoding: [0xff,0x37,0x62,0x19] // ERROR: instruction requires: lsui sttsetl x0, [x2] -// CHECK: ldtset x0, xzr, [x2] // encoding: [0x5f,0x34,0x20,0x59] +// CHECK: ldtsetl x0, xzr, [x2] // encoding: [0x5f,0x34,0x60,0x59] // ERROR: instruction requires: lsui sttsetl x2, [sp] -// CHECK: ldtset x2, xzr, [sp] // encoding: [0xff,0x37,0x22,0x59] -// ERROR: instruction requires: lsui - - sttsetal w0, [x2] -// CHECK: ldtset w0, wzr, [x2] // encoding: [0x5f,0x34,0x20,0x19] -// ERROR: instruction requires: lsui - sttsetal x2, [sp] -// CHECK: ldtset x2, xzr, [sp] // encoding: [0xff,0x37,0x22,0x59] -// ERROR: instruction requires: lsui - sttsetal x0, [x2] -// CHECK: ldtset x0, xzr, [x2] // encoding: [0x5f,0x34,0x20,0x59] -// ERROR: instruction requires: lsui - sttsetal x2, [sp] -// CHECK: ldtset x2, xzr, [sp] // encoding: [0xff,0x37,0x22,0x59] +// CHECK: ldtsetl x2, xzr, [sp] // encoding: [0xff,0x37,0x62,0x59] // ERROR: instruction requires: lsui //------------------------------------------------------------------------------ diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s index 3733b162edcfb..9ab177cf2b125 100644 --- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s @@ -280,6 +280,16 @@ .amdhsa_shared_vgpr_count 15 .end_amdhsa_kernel +// GCN-LABEL: warning: test_amdhsa_inst_pref_size_invalid +// PREGFX10: error: directive requires gfx11+ +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_inst_pref_size_invalid" +.amdhsa_kernel test_amdhsa_inst_pref_size_invalid + .amdhsa_next_free_vgpr 273 + .amdhsa_next_free_sgpr 0 + .amdhsa_inst_pref_size 15 +.end_amdhsa_kernel + // GCN-LABEL: warning: test_next_free_vgpr_invalid // AMDHSA: error: .amdgcn.next_free_{v,s}gpr symbols must be absolute expressions // NONAMDHSA-NOT: error: diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s index ea649bc76116a..e90a976008229 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s @@ -33,7 +33,7 @@ // complete // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 f00f0000 // OBJDUMP-NEXT: 0070 015021e4 1f0f007f 5e040000 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 @@ -120,6 +120,7 @@ disabled_user_sgpr: .amdhsa_workgroup_processor_mode 1 .amdhsa_memory_ordered 1 .amdhsa_forward_progress 1 + .amdhsa_inst_pref_size 255 .amdhsa_round_robin_scheduling 1 .amdhsa_exception_fp_ieee_invalid_op 1 .amdhsa_exception_fp_denorm_src 1 @@ -158,6 +159,7 @@ disabled_user_sgpr: // ASM-NEXT: .amdhsa_workgroup_processor_mode 1 // ASM-NEXT: .amdhsa_memory_ordered 1 // 
ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_inst_pref_size 255 // ASM-NEXT: .amdhsa_round_robin_scheduling 1 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 // ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s index 85a7ad05b00f4..68cf28f2ac49d 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s @@ -133,6 +133,7 @@ expr_defined: // ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 // ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 // ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_inst_pref_size 0 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&16777216)>>24 // ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&33554432)>>25 // ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&67108864)>>26 @@ -180,6 +181,7 @@ expr_defined: // ASM-NEXT: .amdhsa_memory_ordered 1 // ASM-NEXT: .amdhsa_forward_progress 1 // ASM-NEXT: .amdhsa_shared_vgpr_count 0 +// ASM-NEXT: .amdhsa_inst_pref_size 0 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 // ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 // ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s index 51d0fb30b320c..6f7a9a2605681 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s @@ 
-9,12 +9,12 @@ // expr_defined_later // OBJDUMP-NEXT: 0000 2b000000 2c000000 00000000 00000000 // OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 f0020000 // OBJDUMP-NEXT: 0030 05f02fe4 811f007f 000c0000 00000000 // expr_defined // OBJDUMP-NEXT: 0040 2a000000 2b000000 00000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 -// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 f0020000 // OBJDUMP-NEXT: 0070 05f02fe4 811f007f 000c0000 00000000 .text @@ -53,6 +53,7 @@ expr_defined: .amdhsa_workgroup_processor_mode defined_boolean .amdhsa_memory_ordered defined_boolean .amdhsa_forward_progress defined_boolean + .amdhsa_inst_pref_size defined_value+6 .amdhsa_exception_fp_ieee_invalid_op defined_boolean .amdhsa_exception_fp_denorm_src defined_boolean .amdhsa_exception_fp_ieee_div_zero defined_boolean @@ -89,6 +90,7 @@ expr_defined: .amdhsa_workgroup_processor_mode defined_boolean .amdhsa_memory_ordered defined_boolean .amdhsa_forward_progress defined_boolean + .amdhsa_inst_pref_size defined_value+6 .amdhsa_exception_fp_ieee_invalid_op defined_boolean .amdhsa_exception_fp_denorm_src defined_boolean .amdhsa_exception_fp_ieee_div_zero defined_boolean @@ -132,6 +134,7 @@ expr_defined: // ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29 // ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 // ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 +// ASM-NEXT: .amdhsa_inst_pref_size (((defined_value+6)<<4)&4080)>>4 // ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((1611399168|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&16777216)>>24 // ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&33554432)>>25 @@ -177,6 +180,7 @@ expr_defined: // ASM-NEXT: .amdhsa_workgroup_processor_mode 1 // ASM-NEXT: .amdhsa_memory_ordered 1 // ASM-NEXT: .amdhsa_forward_progress 1 +// ASM-NEXT: .amdhsa_inst_pref_size 47 // ASM-NEXT: .amdhsa_round_robin_scheduling 1 // ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 // ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 diff --git a/llvm/test/Other/new-pm-O0-defaults.ll b/llvm/test/Other/new-pm-O0-defaults.ll index e8131ac7fab45..81d1ee0df2c5b 100644 --- a/llvm/test/Other/new-pm-O0-defaults.ll +++ b/llvm/test/Other/new-pm-O0-defaults.ll @@ -51,6 +51,7 @@ ; CHECK-LTO-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-LTO-NEXT: Running pass: LowerTypeTestsPass ; CHECK-LTO-NEXT: Running pass: LowerTypeTestsPass +; CHECK-LTO-NEXT: CoroConditionalWrapper ; CHECK-CORO-NEXT: Running pass: AnnotationRemarksPass ; CHECK-CORO-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-LTO-NEXT: Running pass: AnnotationRemarksPass diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index 86480c5115748..3aea0f2061f3e 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -67,6 +67,7 @@ ; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass ; CHECK-O-NEXT: Running pass: WholeProgramDevirtPass +; CHECK-O23SZ-NEXT: Running pass: CoroEarlyPass ; CHECK-O1-NEXT: Running pass: LowerTypeTestsPass ; CHECK-O23SZ-NEXT: Running pass: GlobalOptPass ; CHECK-O23SZ-NEXT: Running pass: PromotePass @@ -85,7 +86,9 @@ ; CHECK-O23SZ-NEXT: Running pass: GlobalOptPass ; CHECK-O23SZ-NEXT: Running pass: OpenMPOptPass ; CHECK-O23SZ-NEXT: Running pass: GlobalDCEPass -; CHECK-O23SZ-NEXT: Running pass: ArgumentPromotionPass +; CHECK-O23SZ-NEXT: Running pass: ArgumentPromotionPass on (foo) +; CHECK-O23SZ-NEXT: CoroSplitPass on (foo) +; CHECK-O23SZ-NEXT: CoroAnnotationElidePass on (foo) ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O23SZ-NEXT: Running pass: ConstraintEliminationPass @@ -156,6 +159,8 @@ ; CHECK-O23SZ-NEXT: Running pass: GlobalDCEPass ; CHECK-O23SZ-NEXT: Running pass: RelLookupTableConverterPass ; CHECK-O23SZ-NEXT: Running pass: CGProfilePass +; CHECK-O1-NEXT: Running pass: CoroConditionalWrapper +; CHECK-O23SZ-NEXT: Running pass: CoroCleanupPass ; CHECK-EP-NEXT: Running pass: NoOpModulePass ; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo ; CHECK-O-NEXT: Running pass: PrintModulePass diff --git 
a/llvm/test/TableGen/get-named-operand-idx.td b/llvm/test/TableGen/get-named-operand-idx.td new file mode 100644 index 0000000000000..f5c5d93f9e522 --- /dev/null +++ b/llvm/test/TableGen/get-named-operand-idx.td @@ -0,0 +1,85 @@ +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s | FileCheck %s + +// Check that OpName enum and getNamedOperandIdx are as expected. + +include "llvm/Target/Target.td" + +def archInstrInfo : InstrInfo { } + +def arch : Target { + let InstructionSet = archInstrInfo; +} + +class InstBase : Instruction { + let Namespace = "MyNamespace"; + let UseNamedOperandTable = 1; + let Size = 1; + field bits<8> Inst; +} + +def Reg : Register<"reg">; +def RegClass : RegisterClass<"foo", [i32], 0, (add Reg)>; + +def OpA : Operand; +def OpB : Operand; + +def RegOp : RegisterOperand; + +def InstA : InstBase { + let OutOperandList = (outs OpA:$a); + let InOperandList = (ins OpB:$b, i32imm:$c); +} + +def InstB : InstBase { + let OutOperandList = (outs i32imm:$d); + let InOperandList = (ins unknown:$x); +} + +def InstC : InstBase { + let OutOperandList = (outs RegClass:$d); + let InOperandList = (ins RegOp:$x); +} + +// InstD has UseNamedOperandTable = 0, so it won't be handled in +// getNamedOperandIdx(). +def InstD : InstBase { + let OutOperandList = (outs RegClass:$e); + let InOperandList = (ins RegOp:$f); + let UseNamedOperandTable = 0; +} + +// CHECK: #ifdef GET_INSTRINFO_OPERAND_ENUM +// CHECK: #undef GET_INSTRINFO_OPERAND_ENUM +// CHECK: namespace llvm::MyNamespace { +// CHECK: enum class OpName { +// CHECK: a = 0, +// CHECK: b = 1, +// CHECK: c = 2, +// CHECK: d = 3, +// CHECK: x = 4, +// CHECK: NUM_OPERAND_NAMES = 5, +// CHECK: }; // enum class OpName +// CHECK: } // end namespace llvm::MyNamespace +// CHECK: #endif //GET_INSTRINFO_OPERAND_ENUM + +// CHECK: #ifdef GET_INSTRINFO_NAMED_OPS +// CHECK: #undef GET_INSTRINFO_NAMED_OPS +// CHECK: namespace llvm::MyNamespace { +// CHECK: LLVM_READONLY +// CHECK: int16_t getNamedOperandIdx(uint16_t Opcode, OpName Name) { +// CHECK: assert(Name != OpName::NUM_OPERAND_NAMES); +// CHECK: static constexpr int8_t OperandMap[][5] = { +// CHECK: {0, 1, 2, -1, -1, }, +// CHECK: {-1, -1, -1, 0, 1, }, +// CHECK: }; +// CHECK: switch(Opcode) { +// CHECK: case MyNamespace::InstA: +// CHECK: return OperandMap[0][static_cast(Name)]; +// CHECK: case MyNamespace::InstB: +// CHECK: case MyNamespace::InstC: +// CHECK: return OperandMap[1][static_cast(Name)]; +// CHECK: default: return -1; +// CHECK: } +// CHECK: } +// CHECK: } // end namespace llvm::MyNamespace +// CHECK: #endif //GET_INSTRINFO_NAMED_OPS diff --git a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll index f706184f9727e..a3b065667702f 100644 --- a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll +++ b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll @@ -14,7 +14,7 @@ define ptr @b(ptr %q) { ret ptr %tmp } -; CHECK: define ptr @c(ptr readnone returned %r) +; CHECK: define ptr @c(ptr readnone returned captures(address_is_null, ret: address, provenance) %r) @g = global i32 0 define ptr @c(ptr %r) { %a = icmp eq ptr %r, null diff --git a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll index 13954694eefe0..99406696d33d1 100644 --- a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll +++ b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll @@ -145,8 +145,8 @@ return: ; preds = %cond.end, %if.then3 ; TEST SCC test returning a pointer 
value argument ; -; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned %r) -; FNATTR: define ptr @ptr_scc_r1(ptr %a, ptr readnone %r, ptr readnone captures(none) %b) +; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned captures(ret: address, provenance) %r) +; FNATTR: define ptr @ptr_scc_r1(ptr readnone %a, ptr readnone %r, ptr readnone captures(none) %b) ; FNATTR: define ptr @ptr_scc_r2(ptr readnone %a, ptr readnone %b, ptr readnone %r) ; ; @@ -260,8 +260,8 @@ entry: ; TEST another SCC test ; -; FNATTR: define ptr @rt2_helper(ptr %a) -; FNATTR: define ptr @rt2(ptr readnone %a, ptr readnone %b) +; FNATTR: define ptr @rt2_helper(ptr readnone captures(address_is_null) %a) +; FNATTR: define ptr @rt2(ptr readnone captures(address_is_null) %a, ptr readnone captures(ret: address, provenance) %b) define ptr @rt2_helper(ptr %a) #0 { entry: %call = call ptr @rt2(ptr %a, ptr %a) @@ -284,8 +284,8 @@ if.end: ; TEST another SCC test ; -; FNATTR: define ptr @rt3_helper(ptr %a, ptr %b) -; FNATTR: define ptr @rt3(ptr readnone %a, ptr readnone %b) +; FNATTR: define ptr @rt3_helper(ptr readnone captures(address_is_null) %a, ptr readnone %b) +; FNATTR: define ptr @rt3(ptr readnone captures(address_is_null) %a, ptr readnone %b) define ptr @rt3_helper(ptr %a, ptr %b) #0 { entry: %call = call ptr @rt3(ptr %a, ptr %b) @@ -316,7 +316,7 @@ if.end: ; } ; ; -; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned %r) +; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned captures(ret: address, provenance) %r) declare void @unknown_fn(ptr) #0 define ptr @calls_unknown_fn(ptr %r) #0 { @@ -415,7 +415,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @bitcast(ptr readnone returned %b) +; FNATTR: define ptr @bitcast(ptr readnone returned captures(ret: address, provenance) %b) ; define ptr @bitcast(ptr %b) #0 { entry: @@ -433,7 +433,7 @@ entry: ; } ; ; -; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone %b) +; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone captures(address_is_null, ret: address, provenance) %b) ; define ptr @bitcasts_select_and_phi(ptr %b) #0 { entry: @@ -462,7 +462,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone %b) +; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) ; define ptr @ret_arg_arg_undef(ptr %b) #0 { entry: @@ -494,7 +494,7 @@ ret_undef: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone %b) +; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone captures(address_is_null, ret: address, provenance) %b) ; define ptr @ret_undef_arg_arg(ptr %b) #0 { entry: @@ -526,7 +526,7 @@ ret_arg1: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone %b) +; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) define ptr @ret_undef_arg_undef(ptr %b) #0 { entry: %cmp = icmp eq ptr %b, null diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 6164f2adbf5b9..6debe5de3966e 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -7,7 +7,7 @@ define ptr @c1(ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @c1 -; FNATTRS-SAME: (ptr readnone returned [[Q:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[Q:%.*]]) 
#[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[Q]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) @@ -512,7 +512,7 @@ define void @test4_1(ptr %x4_1, i1 %c) { define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @test4_2 -; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { +; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned captures(ret: address, provenance) [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; FNATTRS: t: ; FNATTRS-NEXT: call void @test4_1(ptr null, i1 [[C]]) @@ -740,7 +740,7 @@ define void @captureStrip(ptr %p) { define i1 @captureICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmp -; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -757,7 +757,7 @@ define i1 @captureICmp(ptr %x) { define i1 @captureICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmpRev -; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr null, [[X]] ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -771,10 +771,29 @@ define i1 @captureICmpRev(ptr %x) { ret i1 %1 } +define i1 @captureICmpWrongPred(ptr %x) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; FNATTRS-LABEL: define i1 @captureICmpWrongPred +; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null +; FNATTRS-NEXT: ret i1 [[TMP1]] +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; ATTRIBUTOR-LABEL: define i1 @captureICmpWrongPred +; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null +; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] +; + %1 = icmp slt ptr %x, null + ret i1 %1 +} + +; We could infer captures(address_is_null) here, but don't bother, because +; InstCombine will optimize the GEP away. 
define i1 @nocaptureInboundsGEPICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -794,7 +813,7 @@ define i1 @nocaptureInboundsGEPICmp(ptr %x) { define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmpRev -; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr null, [[TMP1]] ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -811,6 +830,46 @@ define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ret i1 %2 } +define i1 @notInboundsGEPICmp(ptr %x) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; FNATTRS-LABEL: define i1 @notInboundsGEPICmp +; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 +; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null +; FNATTRS-NEXT: ret i1 [[TMP2]] +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; ATTRIBUTOR-LABEL: define i1 @notInboundsGEPICmp +; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 +; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null +; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] +; + %1 = getelementptr i32, ptr %x, i32 5 + %2 = icmp eq ptr %1, null + ret i1 %2 +} + +define i1 @inboundsGEPICmpNullPointerDefined(ptr %x) null_pointer_is_valid { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) +; FNATTRS-LABEL: define i1 @inboundsGEPICmpNullPointerDefined +; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR16:[0-9]+]] { +; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 +; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null +; FNATTRS-NEXT: ret i1 [[TMP2]] +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) +; ATTRIBUTOR-LABEL: define i1 @inboundsGEPICmpNullPointerDefined +; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR12:[0-9]+]] { +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 +; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null +; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] +; + %1 = getelementptr i32, ptr %x, i32 5 + %2 = icmp eq ptr %1, null + ret i1 %2 +} + define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @nocaptureDereferenceableOrNullICmp @@ -831,13 +890,13 @@ define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) define i1 @captureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) null_pointer_is_valid { ; FNATTRS: Function Attrs: mustprogress 
nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @captureDereferenceableOrNullICmp -; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone captures(address_is_null) dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; ATTRIBUTOR-LABEL: define i1 @captureDereferenceableOrNullICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12:[0-9]+]] { +; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] ; @@ -903,7 +962,7 @@ define void @readnone_indirec(ptr %f, ptr %p) { define ptr @captures_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @captures_ret_only -; FNATTRS-SAME: (ptr readnone [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 8 ; FNATTRS-NEXT: ret ptr [[GEP]] ; @@ -917,6 +976,8 @@ define ptr @captures_ret_only(ptr %p) { ret ptr %gep } +; Even though the ptrtoint is only used in the return value, this should *not* +; be considered a read-only capture. define i64 @captures_not_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i64 @captures_not_ret_only @@ -935,35 +996,52 @@ define i64 @captures_not_ret_only(ptr %p) { } define void @captures_read_provenance(ptr %p) { -; COMMON-LABEL: define void @captures_read_provenance -; COMMON-SAME: (ptr [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; COMMON-NEXT: ret void +; FNATTRS-LABEL: define void @captures_read_provenance +; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { +; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define void @captures_read_provenance +; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { +; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; ATTRIBUTOR-NEXT: ret void ; call void @capture(ptr captures(address, read_provenance) %p) ret void } define void @captures_unused_ret(ptr %p) { -; COMMON-LABEL: define void @captures_unused_ret -; COMMON-SAME: (ptr [[P:%.*]]) { -; COMMON-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; COMMON-NEXT: ret void +; FNATTRS-LABEL: define void @captures_unused_ret +; FNATTRS-SAME: (ptr captures(address_is_null) [[P:%.*]]) { +; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define void @captures_unused_ret +; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; ATTRIBUTOR-NEXT: ret void ; call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret void } define ptr @captures_used_ret(ptr %p) { -; COMMON-LABEL: define 
ptr @captures_used_ret -; COMMON-SAME: (ptr [[P:%.*]]) { -; COMMON-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; COMMON-NEXT: ret ptr [[RET]] +; FNATTRS-LABEL: define ptr @captures_used_ret +; FNATTRS-SAME: (ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) { +; FNATTRS-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; FNATTRS-NEXT: ret ptr [[RET]] +; +; ATTRIBUTOR-LABEL: define ptr @captures_used_ret +; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { +; ATTRIBUTOR-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; ATTRIBUTOR-NEXT: ret ptr [[RET]] ; %ret = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret ptr %ret } +; Make sure this does not produce captures(ret: ...). We need to take the +; return capture components into account when handling argument SCCs. define ptr @scc_capture_via_ret(i1 %c, ptr %p) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @scc_capture_via_ret @@ -999,5 +1077,72 @@ else: ret ptr %p } +define i1 @improve_existing_captures(ptr captures(address) %p) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; FNATTRS-LABEL: define i1 @improve_existing_captures +; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; FNATTRS-NEXT: ret i1 [[CMP]] +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; ATTRIBUTOR-LABEL: define i1 @improve_existing_captures +; ATTRIBUTOR-SAME: (ptr nofree readnone captures(address) [[P:%.*]]) #[[ATTR0]] { +; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; ATTRIBUTOR-NEXT: ret i1 [[CMP]] +; + %cmp = icmp eq ptr %p, null + ret i1 %cmp +} + +define void @dont_increase_existing_captures(ptr captures(address) %p) { +; COMMON-LABEL: define void @dont_increase_existing_captures +; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { +; COMMON-NEXT: call void @capture(ptr [[P]]) +; COMMON-NEXT: ret void +; + call void @capture(ptr %p) + ret void +} + +define void @dont_increase_existing_captures_trivial_scc(ptr captures(address) %p) { +; COMMON-LABEL: define void @dont_increase_existing_captures_trivial_scc +; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { +; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; COMMON-NEXT: call void @dont_increase_existing_captures_trivial_scc(ptr [[P]]) +; COMMON-NEXT: ret void +; + call void @capture(ptr captures(address, read_provenance) %p) + call void @dont_increase_existing_captures_trivial_scc(ptr %p) + ret void +} + +define void @dont_increase_existing_captures_scc1(ptr captures(address) %p) { +; COMMON-LABEL: define void @dont_increase_existing_captures_scc1 +; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { +; COMMON-NEXT: call void @dont_increase_existing_captures_scc2(ptr [[P]]) +; COMMON-NEXT: ret void +; + call void @dont_increase_existing_captures_scc2(ptr %p) + ret void +} + +define void @dont_increase_existing_captures_scc2(ptr %p) { +; FNATTRS-LABEL: define void @dont_increase_existing_captures_scc2 +; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { +; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; FNATTRS-NEXT: call void
@dont_increase_existing_captures_scc1(ptr [[P]]) +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define void @dont_increase_existing_captures_scc2 +; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { +; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; ATTRIBUTOR-NEXT: call void @dont_increase_existing_captures_scc1(ptr [[P]]) +; ATTRIBUTOR-NEXT: ret void +; + call void @capture(ptr captures(address, read_provenance) %p) + call void @dont_increase_existing_captures_scc1(ptr %p) + ret void +} + declare ptr @llvm.launder.invariant.group.p0(ptr) declare ptr @llvm.strip.invariant.group.p0(ptr) diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 0f6762f0d4342..94093568419af 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -19,7 +19,7 @@ define ptr @test1() { ; Return a pointer trivially nonnull (argument attribute) define ptr @test2(ptr nonnull %p) { ; FNATTRS-LABEL: define nonnull ptr @test2( -; FNATTRS-SAME: ptr nonnull readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: ptr nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[P]] ; ; ATTRIBUTOR-LABEL: define nonnull ptr @test2( @@ -194,7 +194,7 @@ exit: define ptr @test7(ptr %a) { ; FNATTRS-LABEL: define ptr @test7( -; FNATTRS-SAME: ptr readnone returned [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone returned captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr [[A]] ; ; ATTRIBUTOR-LABEL: define ptr @test7( @@ -206,7 +206,7 @@ define ptr @test7(ptr %a) { define ptr @test8(ptr %a) { ; FNATTRS-LABEL: define nonnull ptr @test8( -; FNATTRS-SAME: ptr readnone [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -221,7 +221,7 @@ define ptr @test8(ptr %a) { define ptr @test9(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test9( -; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -238,7 +238,7 @@ declare void @llvm.assume(i1) ; FIXME: missing nonnull define ptr @test10(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test10( -; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { +; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 ; FNATTRS-NEXT: call void @llvm.assume(i1 [[CMP]]) ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] @@ -263,7 +263,7 @@ define ptr @test10(ptr %a, i64 %n) { ; } define ptr @test11(ptr) local_unnamed_addr { ; FNATTRS-LABEL: define nonnull ptr @test11( -; FNATTRS-SAME: ptr readnone [[TMP0:%.*]]) local_unnamed_addr { +; FNATTRS-SAME: ptr readnone captures(address_is_null, ret: address, provenance) [[TMP0:%.*]]) local_unnamed_addr { ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null ; FNATTRS-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; FNATTRS: 3: @@ -362,7 +362,7 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { ; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull 
readonly %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f1( -; FNATTRS-SAME: ptr readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null ; FNATTRS-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] @@ -431,7 +431,7 @@ bb9: ; preds = %bb4, %bb define internal ptr @f2(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f2(ptr nonnull %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f2( -; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -452,7 +452,7 @@ bb: define dso_local noalias ptr @f3(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f3(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define dso_local noalias nonnull ptr @f3( -; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -945,7 +945,7 @@ exc: define ptr @gep1(ptr %p) { ; FNATTRS-LABEL: define nonnull ptr @gep1( -; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -961,7 +961,7 @@ define ptr @gep1(ptr %p) { define ptr @gep1_no_null_opt(ptr %p) #0 { ; Should't be able to derive nonnull based on gep. ; FNATTRS-LABEL: define ptr @gep1_no_null_opt( -; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR8:[0-9]+]] { +; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR8:[0-9]+]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -976,7 +976,7 @@ define ptr @gep1_no_null_opt(ptr %p) #0 { define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FNATTRS-LABEL: define ptr addrspace(3) @gep2( -; FNATTRS-SAME: ptr addrspace(3) readnone [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 1 ; FNATTRS-NEXT: ret ptr addrspace(3) [[Q]] ; @@ -992,7 +992,7 @@ define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FIXME: We should propagate dereferenceable here but *not* nonnull define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) { ; FNATTRS-LABEL: define noundef ptr addrspace(3) @as( -; FNATTRS-SAME: ptr addrspace(3) readnone returned dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone returned captures(ret: address, provenance) dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr addrspace(3) [[P]] ; ; ATTRIBUTOR-LABEL: define ptr addrspace(3) @as( @@ -1383,7 +1383,7 @@ define void @PR43833_simple(ptr %0, i32 %1) { define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) { ; FNATTRS-LABEL: define ptr @pr91177_non_inbounds_gep( -; FNATTRS-SAME: ptr nonnull readnone [[ARG:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr nonnull readnone captures(ret: address, provenance) [[ARG:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[ARG]], i64 -8 ; FNATTRS-NEXT: ret ptr [[RES]] ; diff --git 
a/llvm/test/Transforms/FunctionAttrs/noundef.ll b/llvm/test/Transforms/FunctionAttrs/noundef.ll index b7c583880501a..4f53c08804621 100644 --- a/llvm/test/Transforms/FunctionAttrs/noundef.ll +++ b/llvm/test/Transforms/FunctionAttrs/noundef.ll @@ -169,7 +169,7 @@ define i64 @test_trunc_with_constexpr() { define align 4 ptr @maybe_not_aligned(ptr noundef %p) { ; CHECK-LABEL: define align 4 ptr @maybe_not_aligned( -; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -177,7 +177,7 @@ define align 4 ptr @maybe_not_aligned(ptr noundef %p) { define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { ; CHECK-LABEL: define noundef align 4 ptr @definitely_aligned( -; CHECK-SAME: ptr noundef readnone returned align 4 [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned align 4 captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -185,7 +185,7 @@ define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { ; CHECK-LABEL: define nonnull ptr @maybe_not_nonnull( -; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -193,7 +193,7 @@ define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { define nonnull ptr @definitely_nonnull(ptr noundef nonnull %p) { ; CHECK-LABEL: define noundef nonnull ptr @definitely_nonnull( -; CHECK-SAME: ptr noundef nonnull readnone returned [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index b24c097ad54d0..5fc88d623c0ec 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -35,7 +35,7 @@ define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) { define ptr @test2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define {{[^@]+}}@test2 -; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: store i32 0, ptr @x, align 4 ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -58,7 +58,7 @@ define ptr @test2(ptr %p) { define i1 @test3(ptr %p, ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test3 -; FNATTRS-SAME: (ptr readnone [[P:%.*]], ptr readnone [[Q:%.*]]) #[[ATTR1:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone captures(address) [[P:%.*]], ptr readnone captures(address) [[Q:%.*]]) #[[ATTR1:[0-9]+]] { ; FNATTRS-NEXT: [[A:%.*]] = icmp ult ptr [[P]], [[Q]] ; FNATTRS-NEXT: ret i1 [[A]] ; @@ -197,7 +197,7 @@ define void @test7_2(ptr preallocated(i32) %a) { define ptr @test8_1(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test8_1 -; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR1]] { +; FNATTRS-SAME: (ptr readnone returned 
captures(ret: address, provenance) [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -220,7 +220,7 @@ entry: define void @test8_2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; FNATTRS-LABEL: define {{[^@]+}}@test8_2 -; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: (ptr writeonly captures(none) [[P:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: [[CALL:%.*]] = call ptr @test8_1(ptr [[P]]) ; FNATTRS-NEXT: store i32 10, ptr [[CALL]], align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll b/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll new file mode 100644 index 0000000000000..4d5db3263f527 --- /dev/null +++ b/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -passes=function-attrs < %s | FileCheck --check-prefixes=COMMON,FNATTRS %s +; RUN: opt -S -passes=attributor-light < %s | FileCheck --check-prefixes=COMMON,ATTRIBUTOR %s + +; Make sure norecurse is inferred on the calling functions + +define internal void @sendmsg_is_norecurse() { +; FNATTRS: Function Attrs: mustprogress norecurse nounwind willreturn +; FNATTRS-LABEL: define internal void @sendmsg_is_norecurse( +; FNATTRS-SAME: ) #[[ATTR0:[0-9]+]] { +; FNATTRS-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0) +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress norecurse nounwind willreturn +; ATTRIBUTOR-LABEL: define internal void @sendmsg_is_norecurse( +; ATTRIBUTOR-SAME: ) #[[ATTR0:[0-9]+]] { +; ATTRIBUTOR-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0) #[[ATTR4:[0-9]+]] +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0) + ret void +} + +define internal void @sendmsghalt_is_norecurse() { +; COMMON: Function Attrs: norecurse nounwind +; COMMON-LABEL: define internal void @sendmsghalt_is_norecurse( +; COMMON-SAME: ) #[[ATTR1:[0-9]+]] { +; COMMON-NEXT: call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) +; COMMON-NEXT: ret void +; + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) + ret void +} + +define internal i32 @sendmsg_rtn_is_norecurse() { +; FNATTRS: Function Attrs: mustprogress norecurse nounwind willreturn +; FNATTRS-LABEL: define internal i32 @sendmsg_rtn_is_norecurse( +; FNATTRS-SAME: ) #[[ATTR0]] { +; FNATTRS-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 1) +; FNATTRS-NEXT: ret i32 [[RES]] +; +; ATTRIBUTOR: Function Attrs: mustprogress norecurse nounwind willreturn +; ATTRIBUTOR-LABEL: define internal i32 @sendmsg_rtn_is_norecurse( +; ATTRIBUTOR-SAME: ) #[[ATTR0]] { +; ATTRIBUTOR-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 1) #[[ATTR4]] +; ATTRIBUTOR-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.amdgcn.s.sendmsg.rtn(i32 1) + ret i32 %res +} + +define void @user() { +; FNATTRS-LABEL: define void @user() { +; FNATTRS-NEXT: call void @sendmsg_is_norecurse() +; FNATTRS-NEXT: call void @sendmsghalt_is_norecurse() +; FNATTRS-NEXT: call void @sendmsg_rtn_is_norecurse() +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: norecurse nounwind +; ATTRIBUTOR-LABEL: define void @user( +; ATTRIBUTOR-SAME: ) #[[ATTR1]] { +; ATTRIBUTOR-NEXT: call void @sendmsg_is_norecurse() #[[ATTR5:[0-9]+]] +; ATTRIBUTOR-NEXT: call void @sendmsghalt_is_norecurse() #[[ATTR6:[0-9]+]] +; ATTRIBUTOR-NEXT: call 
void @sendmsg_rtn_is_norecurse() #[[ATTR6]] +; ATTRIBUTOR-NEXT: ret void +; + call void @sendmsg_is_norecurse() + call void @sendmsghalt_is_norecurse() + call void @sendmsg_rtn_is_norecurse() + ret void +} +;. +; FNATTRS: attributes #[[ATTR0]] = { mustprogress norecurse nounwind willreturn } +; FNATTRS: attributes #[[ATTR1]] = { norecurse nounwind } +; FNATTRS: attributes #[[ATTR2:[0-9]+]] = { nocallback nounwind willreturn } +; FNATTRS: attributes #[[ATTR3:[0-9]+]] = { nocallback nounwind } +;. +; ATTRIBUTOR: attributes #[[ATTR0]] = { mustprogress norecurse nounwind willreturn } +; ATTRIBUTOR: attributes #[[ATTR1]] = { norecurse nounwind } +; ATTRIBUTOR: attributes #[[ATTR2:[0-9]+]] = { nocallback nounwind willreturn } +; ATTRIBUTOR: attributes #[[ATTR3:[0-9]+]] = { nocallback nounwind } +; ATTRIBUTOR: attributes #[[ATTR4]] = { willreturn } +; ATTRIBUTOR: attributes #[[ATTR5]] = { nounwind willreturn } +; ATTRIBUTOR: attributes #[[ATTR6]] = { nounwind } +;. diff --git a/llvm/test/Transforms/FunctionAttrs/stats.ll b/llvm/test/Transforms/FunctionAttrs/stats.ll index 5f007b4078ff3..dc0387e57174a 100644 --- a/llvm/test/Transforms/FunctionAttrs/stats.ll +++ b/llvm/test/Transforms/FunctionAttrs/stats.ll @@ -16,8 +16,8 @@ entry: ret void } -; CHECK: 2 function-attrs - Number of functions with improved memory attribute -; CHECK-NEXT: 1 function-attrs - Number of arguments marked nocapture +; CHECK: 1 function-attrs - Number of arguments marked captures(none) +; CHECK-NEXT: 2 function-attrs - Number of functions with improved memory attribute ; CHECK-NEXT: 1 function-attrs - Number of functions marked as nofree ; CHECK-NEXT: 2 function-attrs - Number of functions marked as norecurse ; CHECK-NEXT: 2 function-attrs - Number of functions marked as nosync diff --git a/llvm/test/Transforms/InstCombine/add2.ll b/llvm/test/Transforms/InstCombine/add2.ll index c474a33c48a2b..375f2616a3028 100644 --- a/llvm/test/Transforms/InstCombine/add2.ll +++ b/llvm/test/Transforms/InstCombine/add2.ll @@ -325,7 +325,8 @@ define i16 @mul_add_to_mul_9(i16 %a) { define i32 @shl_add_to_shl_constexpr() { ; CHECK-LABEL: @shl_add_to_shl_constexpr( -; CHECK-NEXT: ret i32 mul (i32 ptrtoint (ptr @g to i32), i32 4) +; CHECK-NEXT: [[ADD:%.*]] = shl i32 ptrtoint (ptr @g to i32), 2 +; CHECK-NEXT: ret i32 [[ADD]] ; %shl = shl i32 ptrtoint (ptr @g to i32), 1 %add = add i32 %shl, %shl diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll index 60debf02b5862..db0ecbfed1e8e 100644 --- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll +++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll @@ -1077,4 +1077,148 @@ entry: ret i32 %shr } +define i32 @ashr_shift_mul(i32 %x) { +; CHECK-LABEL: @ashr_shift_mul( +; CHECK-NEXT: [[A:%.*]] = ashr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = ashr exact i32 %x, 3 + %res = mul i32 %a, 9 + ret i32 %res +} + +define i32 @ashr_shift_mul_nuw(i32 %x) { +; CHECK-LABEL: @ashr_shift_mul_nuw( +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = add nuw i32 [[X]], [[TMP1]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = ashr exact i32 %x, 3 + %res = mul nuw i32 %a, 9 + ret i32 %res +} + +define i32 @ashr_shift_mul_nsw(i32 %x) { +; CHECK-LABEL: @ashr_shift_mul_nsw( +; CHECK-NEXT: [[A:%.*]] = ashr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = add nsw i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = ashr exact i32 %x, 3 + %res = mul nsw i32 %a, 9 + ret i32 
%res +} + +define i32 @lshr_shift_mul_nuw(i32 %x) { +; CHECK-LABEL: @lshr_shift_mul_nuw( +; CHECK-NEXT: [[A:%.*]] = lshr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = add nuw i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = lshr exact i32 %x, 3 + %res = mul nuw i32 %a, 9 + ret i32 %res +} + +define i32 @lshr_shift_mul(i32 %x) { +; CHECK-LABEL: @lshr_shift_mul( +; CHECK-NEXT: [[A:%.*]] = lshr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = lshr exact i32 %x, 3 + %res = mul i32 %a, 9 + ret i32 %res +} + +define i32 @lshr_shift_mul_nsw(i32 %x) { +; CHECK-LABEL: @lshr_shift_mul_nsw( +; CHECK-NEXT: [[A:%.*]] = lshr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = add nsw i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = lshr exact i32 %x, 3 + %res = mul nsw i32 %a, 9 + ret i32 %res +} + +; Negative test + +define i32 @lshr_no_exact(i32 %x) { +; CHECK-LABEL: @lshr_no_exact( +; CHECK-NEXT: [[A:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = mul nuw nsw i32 [[A]], 9 +; CHECK-NEXT: ret i32 [[RES]] +; + %a = lshr i32 %x, 3 + %res = mul nsw i32 %a, 9 + ret i32 %res +} + +; Negative test + +define i32 @ashr_no_exact(i32 %x) { +; CHECK-LABEL: @ashr_no_exact( +; CHECK-NEXT: [[A:%.*]] = ashr i32 [[X:%.*]], 3 +; CHECK-NEXT: [[RES:%.*]] = mul nsw i32 [[A]], 9 +; CHECK-NEXT: ret i32 [[RES]] +; + %a = ashr i32 %x, 3 + %res = mul nsw i32 %a, 9 + ret i32 %res +} + +define i32 @lshr_multiuse(i32 %x) { +; CHECK-LABEL: @lshr_multiuse( +; CHECK-NEXT: [[A:%.*]] = lshr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @use(i32 [[A]]) +; CHECK-NEXT: [[RES:%.*]] = add nsw i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = lshr exact i32 %x, 3 + call void @use(i32 %a) + %res = mul nsw i32 %a, 9 + ret i32 %res +} + +define i32 @lshr_multiuse_no_flags(i32 %x) { +; CHECK-LABEL: @lshr_multiuse_no_flags( +; CHECK-NEXT: [[A:%.*]] = lshr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @use(i32 [[A]]) +; CHECK-NEXT: [[RES:%.*]] = add i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = lshr exact i32 %x, 3 + call void @use(i32 %a) + %res = mul i32 %a, 9 + ret i32 %res +} + +define i32 @ashr_multiuse_no_flags(i32 %x) { +; CHECK-LABEL: @ashr_multiuse_no_flags( +; CHECK-NEXT: [[A:%.*]] = ashr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @use(i32 [[A]]) +; CHECK-NEXT: [[RES:%.*]] = add i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = ashr exact i32 %x, 3 + call void @use(i32 %a) + %res = mul i32 %a, 9 + ret i32 %res +} + +define i32 @ashr_multiuse(i32 %x) { +; CHECK-LABEL: @ashr_multiuse( +; CHECK-NEXT: [[A:%.*]] = ashr exact i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @use(i32 [[A]]) +; CHECK-NEXT: [[RES:%.*]] = add nsw i32 [[X]], [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %a = ashr exact i32 %x, 3 + call void @use(i32 %a) + %res = mul nsw i32 %a, 9 + ret i32 %res +} + declare void @use(i32) diff --git a/llvm/test/Transforms/InstCombine/icmp-dom.ll b/llvm/test/Transforms/InstCombine/icmp-dom.ll index 3cf3a7af77041..07793e8951de5 100644 --- a/llvm/test/Transforms/InstCombine/icmp-dom.ll +++ b/llvm/test/Transforms/InstCombine/icmp-dom.ll @@ -534,3 +534,203 @@ else: %cmp1 = icmp eq i32 %and1, 0 ret i1 %cmp1 } + +; TODO: X != Y implies X | Y != 0 +define i1 @or_nonzero_from_nonequal(i8 %x, i8 %y) { +; CHECK-LABEL: @or_nonzero_from_nonequal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: 
if.then: +; CHECK-NEXT: [[OR:%.*]] = or i8 [[X]], [[Y]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[OR]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; +entry: + %cond = icmp eq i8 %x, %y + br i1 %cond, label %if.else, label %if.then + +if.then: + %or = or i8 %x, %y + %cmp = icmp eq i8 %or, 0 + ret i1 %cmp + +if.else: + ret i1 false +} + +define i1 @test_nonequal_domcond1(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: @test_nonequal_domcond1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i64 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = icmp eq i64 [[W:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[COND1]], i1 true, i1 [[COND2]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 false +; CHECK: if.end: +; CHECK-NEXT: ret i1 false +; +entry: + %cond1 = icmp eq i64 %y, %x + %cond2 = icmp eq i64 %w, %z + %or.cond = select i1 %cond1, i1 true, i1 %cond2 + br i1 %or.cond, label %if.end, label %if.then + +if.then: + %sub1 = sub i64 %w, %z + %sub2 = sub i64 %y, %x + %umin = call i64 @llvm.umin.i64(i64 %sub1, i64 %sub2) + %cmp = icmp eq i64 %umin, 0 + ret i1 %cmp + +if.end: + ret i1 false +} + +define i1 @test_nonequal_domcond2(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: @test_nonequal_domcond2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND1:%.*]] = icmp ne i64 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i64 [[W:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[COND1]], i1 [[COND2]], i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 false +; CHECK: if.end: +; CHECK-NEXT: ret i1 false +; +entry: + %cond1 = icmp ne i64 %y, %x + %cond2 = icmp ne i64 %w, %z + %or.cond = select i1 %cond1, i1 %cond2, i1 false + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %sub1 = sub i64 %w, %z + %sub2 = sub i64 %y, %x + %umin = call i64 @llvm.umin.i64(i64 %sub1, i64 %sub2) + %cmp = icmp eq i64 %umin, 0 + ret i1 %cmp + +if.end: + ret i1 false +} + +define i1 @test_nonequal_assume(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: @test_nonequal_assume( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND1:%.*]] = icmp ne i64 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: call void @llvm.assume(i1 [[COND1]]) +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i64 [[W:%.*]], [[Z:%.*]] +; CHECK-NEXT: call void @llvm.assume(i1 [[COND2]]) +; CHECK-NEXT: ret i1 false +; +entry: + %cond1 = icmp ne i64 %y, %x + call void @llvm.assume(i1 %cond1) + %cond2 = icmp ne i64 %w, %z + call void @llvm.assume(i1 %cond2) + + %sub1 = sub i64 %w, %z + %sub2 = sub i64 %y, %x + %umin = call i64 @llvm.umin.i64(i64 %sub1, i64 %sub2) + %cmp = icmp eq i64 %umin, 0 + ret i1 %cmp +} + +; Negative tests + +define i1 @test_nonequal_invalid_domcond1(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: @test_nonequal_invalid_domcond1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND1:%.*]] = icmp ne i64 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = icmp eq i64 [[W:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[COND1]], i1 true, i1 [[COND2]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret i1 true +; CHECK: if.end: +; CHECK-NEXT: ret i1 false +; +entry: + %cond1 = icmp ne i64 %y, %x + %cond2 = icmp eq i64 %w, %z + %or.cond = select i1 %cond1, i1 true, i1 %cond2 + br i1 %or.cond, label %if.end, label %if.then + +if.then: + %sub1 = sub i64 %w, %z + 
%sub2 = sub i64 %y, %x + %umin = call i64 @llvm.umin.i64(i64 %sub1, i64 %sub2) + %cmp = icmp eq i64 %umin, 0 + ret i1 %cmp + +if.end: + ret i1 false +} + +define i1 @test_nonequal_invalid_domcond2(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: @test_nonequal_invalid_domcond2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i64 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = icmp eq i64 [[W:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[COND1]], i1 true, i1 [[COND2]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[SUB1:%.*]] = sub i64 [[W]], [[Z]] +; CHECK-NEXT: [[SUB2:%.*]] = sub i64 [[Y]], [[X]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[SUB1]], i64 [[SUB2]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[UMIN]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %cond1 = icmp eq i64 %y, %x + %cond2 = icmp eq i64 %w, %z + %or.cond = select i1 %cond1, i1 true, i1 %cond2 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + br label %if.end + +if.end: + %sub1 = sub i64 %w, %z + %sub2 = sub i64 %y, %x + %umin = call i64 @llvm.umin.i64(i64 %sub1, i64 %sub2) + %cmp = icmp eq i64 %umin, 0 + ret i1 %cmp +} + +define i1 @test_nonequal_invalid_assume(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: @test_nonequal_invalid_assume( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB1:%.*]] = sub i64 [[W:%.*]], [[Z:%.*]] +; CHECK-NEXT: [[SUB2:%.*]] = sub i64 [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[SUB1]], i64 [[SUB2]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[UMIN]], 0 +; CHECK-NEXT: call void @side_effect() +; CHECK-NEXT: [[COND1:%.*]] = icmp ne i64 [[Y]], [[X]] +; CHECK-NEXT: call void @llvm.assume(i1 [[COND1]]) +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i64 [[W]], [[Z]] +; CHECK-NEXT: call void @llvm.assume(i1 [[COND2]]) +; CHECK-NEXT: ret i1 [[CMP]] +; +entry: + %sub1 = sub i64 %w, %z + %sub2 = sub i64 %y, %x + %umin = call i64 @llvm.umin.i64(i64 %sub1, i64 %sub2) + %cmp = icmp eq i64 %umin, 0 + + call void @side_effect() + %cond1 = icmp ne i64 %y, %x + call void @llvm.assume(i1 %cond1) + %cond2 = icmp ne i64 %w, %z + call void @llvm.assume(i1 %cond2) + ret i1 %cmp +} + +declare void @side_effect() diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index b729cbd971acc..9a9fec694ff0e 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -2167,11 +2167,9 @@ define i8 @test_trunc_and_1(i8 %a) { ; CHECK-NEXT: [[CAST:%.*]] = trunc i8 [[A:%.*]] to i1 ; CHECK-NEXT: br i1 [[CAST]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[B]] +; CHECK-NEXT: ret i8 1 ; CHECK: if.else: -; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[C]] +; CHECK-NEXT: ret i8 0 ; entry: %cast = trunc i8 %a to i1 @@ -2192,11 +2190,9 @@ define i8 @test_not_trunc_and_1(i8 %a) { ; CHECK-NEXT: [[CAST:%.*]] = trunc i8 [[A:%.*]] to i1 ; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[B]] +; CHECK-NEXT: ret i8 0 ; CHECK: if.else: -; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[C]] +; CHECK-NEXT: ret i8 1 ; entry: %cast = trunc i8 %a to i1 @@ -2243,11 +2239,9 @@ define i8 @test_trunc_nuw_and_1(i8 %a) { ; 
CHECK-NEXT: [[CAST:%.*]] = trunc nuw i8 [[A:%.*]] to i1 ; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[B]] +; CHECK-NEXT: ret i8 0 ; CHECK: if.else: -; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[C]] +; CHECK-NEXT: ret i8 1 ; entry: %cast = trunc nuw i8 %a to i1 @@ -2268,11 +2262,9 @@ define i8 @test_trunc_nuw_or_2(i8 %a) { ; CHECK-NEXT: [[CAST:%.*]] = trunc nuw i8 [[A:%.*]] to i1 ; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[B:%.*]] = or i8 [[A]], 2 -; CHECK-NEXT: ret i8 [[B]] +; CHECK-NEXT: ret i8 2 ; CHECK: if.else: -; CHECK-NEXT: [[C:%.*]] = or i8 [[A]], 2 -; CHECK-NEXT: ret i8 [[C]] +; CHECK-NEXT: ret i8 3 ; entry: %cast = trunc nuw i8 %a to i1 @@ -2293,11 +2285,9 @@ define i8 @test_not_trunc_nuw_and_1(i8 %a) { ; CHECK-NEXT: [[CAST:%.*]] = trunc nuw i8 [[A:%.*]] to i1 ; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[B]] +; CHECK-NEXT: ret i8 0 ; CHECK: if.else: -; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 -; CHECK-NEXT: ret i8 [[C]] +; CHECK-NEXT: ret i8 1 ; entry: %cast = trunc nuw i8 %a to i1 @@ -2319,8 +2309,7 @@ define i8 @test_trunc_cond_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[CMP]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -2 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -1 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -2 ; CHECK-NEXT: ret i8 [[OR2]] @@ -2345,8 +2334,7 @@ define i8 @test_not_trunc_cond_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[NOT]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -2 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -2 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -2 ; CHECK-NEXT: ret i8 [[OR2]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index bedf8b6b3a9b5..4e4a5c82c298a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON ; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE -; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW +; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-width=8 -scalable-vectorization=preferred -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -22,7 +22,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, 
i32 %N) #0 { ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -37,14 +37,15 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -114,7 +115,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -129,14 +130,15 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub 
[[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -350,7 +352,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -365,14 +367,15 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -442,7 +445,7 @@ define i32 
@chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -457,14 +460,15 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -516,7 +520,7 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -531,14 +535,16 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] 
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -608,7 +614,7 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -623,14 +629,16 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label 
[[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -766,44 +774,44 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 ; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE-MAXBW: vector.ph: ; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE3]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: 
[[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -858,7 +866,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -873,16 +881,18 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP15]] = sub <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] +; 
CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]]) +; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -954,7 +964,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -969,16 +979,18 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] -; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP20]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP21]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], 
label [[SCALAR_PH]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll new file mode 100644 index 0000000000000..37da523ed7337 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 
[[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sub zeroinitializer, [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add [[VEC_PHI1]], [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = 
add [[TMP26]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[IV]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP14]]) +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %sub = sub i32 0, %mul + %add = add i32 %accum, %sub + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = 
!{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll b/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll index c49f651437cbb..10a76a9d6d01f 100644 --- a/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll +++ b/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll @@ -5,7 +5,7 @@ define i64 @test() { ; CHECK-LABEL: define i64 @test() { -; CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @malloc(i64 mul (i64 ptrtoint (ptr getelementptr (i64, ptr null, i64 1) to i64), i64 4)) +; CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @malloc(i64 mul (i64 4, i64 ptrtoint (ptr getelementptr (i64, ptr null, i64 1) to i64))) ; CHECK-NEXT: store i8 42, ptr [[TMP1]], align 1 ; CHECK-NEXT: [[Y:%.*]] = load i64, ptr [[TMP1]], align 4 ; CHECK-NEXT: ret i64 [[Y]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll index e01dba328a3a1..7175816963ed1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll @@ -9,7 +9,7 @@ target triple = "aarch64" define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 { ; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_( -; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(address_is_null) [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0 ; CHECK-NEXT: br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll new file mode 100644 index 0000000000000..3496520c232aa --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/udotabd.ll @@ -0,0 +1,499 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -O3 < %s | FileCheck %s --check-prefixes=CHECK-O3 +; RUN: opt -S -passes="default,default" < %s | FileCheck %s --check-prefixes=CHECK-LTO + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64" + +define dso_local i32 @test(ptr noundef %p1, i32 noundef %s_p1, ptr noundef %p2, i32 noundef %s_p2) #0 { +; CHECK-O3-LABEL: define dso_local i32 @test( +; CHECK-O3-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-O3-NEXT: [[ENTRY:.*:]] +; CHECK-O3-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64 +; CHECK-O3-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64 +; CHECK-O3-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-O3-NEXT: [[TMP1:%.*]] = zext 
<16 x i8> [[TMP0]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]] +; CHECK-O3-NEXT: [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 false) +; CHECK-O3-NEXT: [[TMP6:%.*]] = zext <16 x i16> [[TMP5]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP7:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]]) +; CHECK-O3-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP12:%.*]] = sub nsw <16 x i16> [[TMP9]], [[TMP11]] +; CHECK-O3-NEXT: [[TMP13:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP12]], i1 false) +; CHECK-O3-NEXT: [[TMP14:%.*]] = zext <16 x i16> [[TMP13]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]]) +; CHECK-O3-NEXT: [[OP_RDX_1:%.*]] = add i32 [[TMP15]], [[TMP7]] +; CHECK-O3-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP16:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP20:%.*]] = sub nsw <16 x i16> [[TMP17]], [[TMP19]] +; CHECK-O3-NEXT: [[TMP21:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP20]], i1 false) +; CHECK-O3-NEXT: [[TMP22:%.*]] = zext <16 x i16> [[TMP21]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP22]]) +; CHECK-O3-NEXT: [[OP_RDX_2:%.*]] = add i32 [[TMP23]], [[OP_RDX_1]] +; CHECK-O3-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]] +; CHECK-O3-NEXT: [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 false) +; CHECK-O3-NEXT: [[TMP30:%.*]] = zext <16 x i16> [[TMP29]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP31:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP30]]) +; CHECK-O3-NEXT: [[OP_RDX_3:%.*]] = add i32 [[TMP31]], [[OP_RDX_2]] +; CHECK-O3-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr 
[[ADD_PTR9_2]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP34:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[TMP34]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP36:%.*]] = sub nsw <16 x i16> [[TMP33]], [[TMP35]] +; CHECK-O3-NEXT: [[TMP37:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP36]], i1 false) +; CHECK-O3-NEXT: [[TMP38:%.*]] = zext <16 x i16> [[TMP37]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP38]]) +; CHECK-O3-NEXT: [[OP_RDX_4:%.*]] = add i32 [[TMP39]], [[OP_RDX_3]] +; CHECK-O3-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP41:%.*]] = zext <16 x i8> [[TMP40]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP42:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP44:%.*]] = sub nsw <16 x i16> [[TMP41]], [[TMP43]] +; CHECK-O3-NEXT: [[TMP45:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP44]], i1 false) +; CHECK-O3-NEXT: [[TMP46:%.*]] = zext <16 x i16> [[TMP45]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP47:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP46]]) +; CHECK-O3-NEXT: [[OP_RDX_5:%.*]] = add i32 [[TMP47]], [[OP_RDX_4]] +; CHECK-O3-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP48:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP50:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP51:%.*]] = zext <16 x i8> [[TMP50]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP52:%.*]] = sub nsw <16 x i16> [[TMP49]], [[TMP51]] +; CHECK-O3-NEXT: [[TMP53:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP52]], i1 false) +; CHECK-O3-NEXT: [[TMP54:%.*]] = zext <16 x i16> [[TMP53]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP55:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP54]]) +; CHECK-O3-NEXT: [[OP_RDX_6:%.*]] = add i32 [[TMP55]], [[OP_RDX_5]] +; CHECK-O3-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP56:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP58:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP59:%.*]] = zext <16 x i8> [[TMP58]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP60:%.*]] = sub nsw <16 x i16> [[TMP57]], [[TMP59]] +; CHECK-O3-NEXT: [[TMP61:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP60]], i1 false) +; CHECK-O3-NEXT: [[TMP62:%.*]] = zext <16 x i16> [[TMP61]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP63:%.*]] = tail call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP62]]) +; CHECK-O3-NEXT: [[OP_RDX_7:%.*]] = add i32 [[TMP63]], [[OP_RDX_6]] +; CHECK-O3-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP64:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP65:%.*]] = zext <16 x i8> [[TMP64]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP66:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP67:%.*]] = zext <16 x i8> [[TMP66]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP68:%.*]] = sub nsw <16 x i16> [[TMP65]], [[TMP67]] +; CHECK-O3-NEXT: [[TMP69:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP68]], i1 false) +; CHECK-O3-NEXT: [[TMP70:%.*]] = zext <16 x i16> [[TMP69]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP71:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP70]]) +; CHECK-O3-NEXT: [[OP_RDX_8:%.*]] = add i32 [[TMP71]], [[OP_RDX_7]] +; CHECK-O3-NEXT: [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP72:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP73:%.*]] = zext <16 x i8> [[TMP72]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP74:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP75:%.*]] = zext <16 x i8> [[TMP74]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP76:%.*]] = sub nsw <16 x i16> [[TMP73]], [[TMP75]] +; CHECK-O3-NEXT: [[TMP77:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP76]], i1 false) +; CHECK-O3-NEXT: [[TMP78:%.*]] = zext <16 x i16> [[TMP77]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP79:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP78]]) +; CHECK-O3-NEXT: [[OP_RDX_9:%.*]] = add i32 [[TMP79]], [[OP_RDX_8]] +; CHECK-O3-NEXT: [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP80:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP81:%.*]] = zext <16 x i8> [[TMP80]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP82:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP84:%.*]] = sub nsw <16 x i16> [[TMP81]], [[TMP83]] +; CHECK-O3-NEXT: [[TMP85:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP84]], i1 false) +; CHECK-O3-NEXT: [[TMP86:%.*]] = zext <16 x i16> [[TMP85]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP87:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP86]]) +; CHECK-O3-NEXT: [[OP_RDX_10:%.*]] = add i32 [[TMP87]], [[OP_RDX_9]] +; CHECK-O3-NEXT: [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP88:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP89:%.*]] = zext <16 x i8> [[TMP88]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP90:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP91:%.*]] = zext <16 x i8> [[TMP90]] to <16 x i16> +; 
CHECK-O3-NEXT: [[TMP92:%.*]] = sub nsw <16 x i16> [[TMP89]], [[TMP91]] +; CHECK-O3-NEXT: [[TMP93:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP92]], i1 false) +; CHECK-O3-NEXT: [[TMP94:%.*]] = zext <16 x i16> [[TMP93]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP95:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP94]]) +; CHECK-O3-NEXT: [[OP_RDX_11:%.*]] = add i32 [[TMP95]], [[OP_RDX_10]] +; CHECK-O3-NEXT: [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP96:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP98:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP99:%.*]] = zext <16 x i8> [[TMP98]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP100:%.*]] = sub nsw <16 x i16> [[TMP97]], [[TMP99]] +; CHECK-O3-NEXT: [[TMP101:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP100]], i1 false) +; CHECK-O3-NEXT: [[TMP102:%.*]] = zext <16 x i16> [[TMP101]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP103:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP102]]) +; CHECK-O3-NEXT: [[OP_RDX_12:%.*]] = add i32 [[TMP103]], [[OP_RDX_11]] +; CHECK-O3-NEXT: [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP104:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP105:%.*]] = zext <16 x i8> [[TMP104]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP106:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP107:%.*]] = zext <16 x i8> [[TMP106]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP108:%.*]] = sub nsw <16 x i16> [[TMP105]], [[TMP107]] +; CHECK-O3-NEXT: [[TMP109:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP108]], i1 false) +; CHECK-O3-NEXT: [[TMP110:%.*]] = zext <16 x i16> [[TMP109]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP111:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP110]]) +; CHECK-O3-NEXT: [[OP_RDX_13:%.*]] = add i32 [[TMP111]], [[OP_RDX_12]] +; CHECK-O3-NEXT: [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP112:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP113:%.*]] = zext <16 x i8> [[TMP112]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP114:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP115:%.*]] = zext <16 x i8> [[TMP114]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP116:%.*]] = sub nsw <16 x i16> [[TMP113]], [[TMP115]] +; CHECK-O3-NEXT: [[TMP117:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP116]], i1 false) +; CHECK-O3-NEXT: [[TMP118:%.*]] = zext <16 x i16> [[TMP117]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]]) +; CHECK-O3-NEXT: [[OP_RDX_14:%.*]] = add i32 [[TMP119]], [[OP_RDX_13]] +; CHECK-O3-NEXT: [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]] +; CHECK-O3-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], 
i64 [[IDX_EXT8]] +; CHECK-O3-NEXT: [[TMP120:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP121:%.*]] = zext <16 x i8> [[TMP120]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP122:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[TBAA0]] +; CHECK-O3-NEXT: [[TMP123:%.*]] = zext <16 x i8> [[TMP122]] to <16 x i16> +; CHECK-O3-NEXT: [[TMP124:%.*]] = sub nsw <16 x i16> [[TMP121]], [[TMP123]] +; CHECK-O3-NEXT: [[TMP125:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP124]], i1 false) +; CHECK-O3-NEXT: [[TMP126:%.*]] = zext <16 x i16> [[TMP125]] to <16 x i32> +; CHECK-O3-NEXT: [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP126]]) +; CHECK-O3-NEXT: [[OP_RDX_15:%.*]] = add i32 [[TMP127]], [[OP_RDX_14]] +; CHECK-O3-NEXT: ret i32 [[OP_RDX_15]] +; +; CHECK-LTO-LABEL: define dso_local i32 @test( +; CHECK-LTO-SAME: ptr noundef readonly captures(none) [[P1:%.*]], i32 noundef [[S_P1:%.*]], ptr noundef readonly captures(none) [[P2:%.*]], i32 noundef [[S_P2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-LTO-NEXT: [[ENTRY:.*:]] +; CHECK-LTO-NEXT: [[IDX_EXT8:%.*]] = sext i32 [[S_P2]] to i64 +; CHECK-LTO-NEXT: [[IDX_EXT:%.*]] = sext i32 [[S_P1]] to i64 +; CHECK-LTO-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] +; CHECK-LTO-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[TMP0]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP4:%.*]] = sub nsw <16 x i16> [[TMP1]], [[TMP3]] +; CHECK-LTO-NEXT: [[TMP5:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP4]], i1 true) +; CHECK-LTO-NEXT: [[TMP36:%.*]] = zext nneg <16 x i16> [[TMP5]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP44:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP36]]) +; CHECK-LTO-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[ADD_PTR9]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[TMP8]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP10:%.*]] = sub nsw <16 x i16> [[TMP7]], [[TMP9]] +; CHECK-LTO-NEXT: [[TMP11:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP10]], i1 true) +; CHECK-LTO-NEXT: [[TMP52:%.*]] = zext nneg <16 x i16> [[TMP11]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP60:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP52]]) +; CHECK-LTO-NEXT: [[OP_RDX_1:%.*]] = add i32 [[TMP60]], [[TMP44]] +; CHECK-LTO-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[ADD_PTR_1]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_1]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP16:%.*]] = sub nsw <16 x i16> [[TMP13]], [[TMP15]] +; CHECK-LTO-NEXT: [[TMP17:%.*]] = tail call <16 x i16> 
@llvm.abs.v16i16(<16 x i16> [[TMP16]], i1 true) +; CHECK-LTO-NEXT: [[TMP68:%.*]] = zext nneg <16 x i16> [[TMP17]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP76:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]]) +; CHECK-LTO-NEXT: [[OP_RDX_2:%.*]] = add i32 [[OP_RDX_1]], [[TMP76]] +; CHECK-LTO-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_1]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP18:%.*]] = load <16 x i8>, ptr [[ADD_PTR_2]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP20:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_2]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP22:%.*]] = sub nsw <16 x i16> [[TMP19]], [[TMP21]] +; CHECK-LTO-NEXT: [[TMP23:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP22]], i1 true) +; CHECK-LTO-NEXT: [[TMP84:%.*]] = zext nneg <16 x i16> [[TMP23]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP92:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP84]]) +; CHECK-LTO-NEXT: [[OP_RDX_3:%.*]] = add i32 [[OP_RDX_2]], [[TMP92]] +; CHECK-LTO-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_2]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr [[ADD_PTR_3]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_3]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP28:%.*]] = sub nsw <16 x i16> [[TMP25]], [[TMP27]] +; CHECK-LTO-NEXT: [[TMP29:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP28]], i1 true) +; CHECK-LTO-NEXT: [[TMP100:%.*]] = zext nneg <16 x i16> [[TMP29]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP108:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP100]]) +; CHECK-LTO-NEXT: [[OP_RDX_4:%.*]] = add i32 [[OP_RDX_3]], [[TMP108]] +; CHECK-LTO-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_4:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_3]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[ADD_PTR_4]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_4]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP34:%.*]] = sub nsw <16 x i16> [[TMP31]], [[TMP33]] +; CHECK-LTO-NEXT: [[TMP35:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP34]], i1 true) +; CHECK-LTO-NEXT: [[TMP116:%.*]] = zext nneg <16 x i16> [[TMP35]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP117:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP116]]) +; CHECK-LTO-NEXT: [[OP_RDX_5:%.*]] = add i32 [[OP_RDX_4]], [[TMP117]] +; CHECK-LTO-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_5:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_4]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP37:%.*]] = load <16 x i8>, ptr [[ADD_PTR_5]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: 
[[TMP38:%.*]] = zext <16 x i8> [[TMP37]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP39:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_5]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP41:%.*]] = sub nsw <16 x i16> [[TMP38]], [[TMP40]] +; CHECK-LTO-NEXT: [[TMP42:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP41]], i1 true) +; CHECK-LTO-NEXT: [[TMP43:%.*]] = zext nneg <16 x i16> [[TMP42]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP118:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP43]]) +; CHECK-LTO-NEXT: [[OP_RDX_6:%.*]] = add i32 [[OP_RDX_5]], [[TMP118]] +; CHECK-LTO-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_6:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_5]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP45:%.*]] = load <16 x i8>, ptr [[ADD_PTR_6]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP46:%.*]] = zext <16 x i8> [[TMP45]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP47:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_6]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP49:%.*]] = sub nsw <16 x i16> [[TMP46]], [[TMP48]] +; CHECK-LTO-NEXT: [[TMP50:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP49]], i1 true) +; CHECK-LTO-NEXT: [[TMP51:%.*]] = zext nneg <16 x i16> [[TMP50]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP120:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP51]]) +; CHECK-LTO-NEXT: [[OP_RDX_7:%.*]] = add i32 [[OP_RDX_6]], [[TMP120]] +; CHECK-LTO-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_6]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_7:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_6]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP53:%.*]] = load <16 x i8>, ptr [[ADD_PTR_7]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP54:%.*]] = zext <16 x i8> [[TMP53]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP55:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_7]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP57:%.*]] = sub nsw <16 x i16> [[TMP54]], [[TMP56]] +; CHECK-LTO-NEXT: [[TMP58:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP57]], i1 true) +; CHECK-LTO-NEXT: [[TMP59:%.*]] = zext nneg <16 x i16> [[TMP58]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP121:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP59]]) +; CHECK-LTO-NEXT: [[OP_RDX_8:%.*]] = add i32 [[OP_RDX_7]], [[TMP121]] +; CHECK-LTO-NEXT: [[ADD_PTR_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_7]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_8:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_7]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP61:%.*]] = load <16 x i8>, ptr [[ADD_PTR_8]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP62:%.*]] = zext <16 x i8> [[TMP61]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP63:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_8]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP64:%.*]] = zext <16 x i8> [[TMP63]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP65:%.*]] = sub nsw <16 x i16> [[TMP62]], [[TMP64]] +; CHECK-LTO-NEXT: [[TMP66:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP65]], i1 true) +; CHECK-LTO-NEXT: [[TMP67:%.*]] = zext nneg <16 x i16> [[TMP66]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP122:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP67]]) +; CHECK-LTO-NEXT: [[OP_RDX_9:%.*]] 
= add i32 [[OP_RDX_8]], [[TMP122]] +; CHECK-LTO-NEXT: [[ADD_PTR_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_8]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_9:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_8]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP69:%.*]] = load <16 x i8>, ptr [[ADD_PTR_9]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP70:%.*]] = zext <16 x i8> [[TMP69]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP71:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_9]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP72:%.*]] = zext <16 x i8> [[TMP71]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP73:%.*]] = sub nsw <16 x i16> [[TMP70]], [[TMP72]] +; CHECK-LTO-NEXT: [[TMP74:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP73]], i1 true) +; CHECK-LTO-NEXT: [[TMP75:%.*]] = zext nneg <16 x i16> [[TMP74]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP123:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP75]]) +; CHECK-LTO-NEXT: [[OP_RDX_10:%.*]] = add i32 [[OP_RDX_9]], [[TMP123]] +; CHECK-LTO-NEXT: [[ADD_PTR_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_9]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_10:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_9]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP77:%.*]] = load <16 x i8>, ptr [[ADD_PTR_10]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP78:%.*]] = zext <16 x i8> [[TMP77]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP79:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_10]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP80:%.*]] = zext <16 x i8> [[TMP79]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP81:%.*]] = sub nsw <16 x i16> [[TMP78]], [[TMP80]] +; CHECK-LTO-NEXT: [[TMP82:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP81]], i1 true) +; CHECK-LTO-NEXT: [[TMP83:%.*]] = zext nneg <16 x i16> [[TMP82]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP124:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP83]]) +; CHECK-LTO-NEXT: [[OP_RDX_11:%.*]] = add i32 [[OP_RDX_10]], [[TMP124]] +; CHECK-LTO-NEXT: [[ADD_PTR_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_10]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_11:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_10]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP85:%.*]] = load <16 x i8>, ptr [[ADD_PTR_11]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP86:%.*]] = zext <16 x i8> [[TMP85]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP87:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_11]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP88:%.*]] = zext <16 x i8> [[TMP87]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP89:%.*]] = sub nsw <16 x i16> [[TMP86]], [[TMP88]] +; CHECK-LTO-NEXT: [[TMP90:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP89]], i1 true) +; CHECK-LTO-NEXT: [[TMP91:%.*]] = zext nneg <16 x i16> [[TMP90]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP125:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP91]]) +; CHECK-LTO-NEXT: [[OP_RDX_12:%.*]] = add i32 [[OP_RDX_11]], [[TMP125]] +; CHECK-LTO-NEXT: [[ADD_PTR_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_11]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_12:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_11]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP93:%.*]] = load <16 x i8>, ptr [[ADD_PTR_12]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP94:%.*]] = zext <16 x i8> [[TMP93]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP95:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_12]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP96:%.*]] = zext <16 x i8> [[TMP95]] to <16 x i16> +; CHECK-LTO-NEXT: 
[[TMP97:%.*]] = sub nsw <16 x i16> [[TMP94]], [[TMP96]] +; CHECK-LTO-NEXT: [[TMP98:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP97]], i1 true) +; CHECK-LTO-NEXT: [[TMP99:%.*]] = zext nneg <16 x i16> [[TMP98]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP126:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP99]]) +; CHECK-LTO-NEXT: [[OP_RDX_13:%.*]] = add i32 [[OP_RDX_12]], [[TMP126]] +; CHECK-LTO-NEXT: [[ADD_PTR_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_12]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_13:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_12]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP101:%.*]] = load <16 x i8>, ptr [[ADD_PTR_13]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP102:%.*]] = zext <16 x i8> [[TMP101]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP103:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_13]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP104:%.*]] = zext <16 x i8> [[TMP103]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP105:%.*]] = sub nsw <16 x i16> [[TMP102]], [[TMP104]] +; CHECK-LTO-NEXT: [[TMP106:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP105]], i1 true) +; CHECK-LTO-NEXT: [[TMP107:%.*]] = zext nneg <16 x i16> [[TMP106]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP119:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP107]]) +; CHECK-LTO-NEXT: [[OP_RDX_14:%.*]] = add i32 [[OP_RDX_13]], [[TMP119]] +; CHECK-LTO-NEXT: [[ADD_PTR_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_13]], i64 [[IDX_EXT]] +; CHECK-LTO-NEXT: [[ADD_PTR9_14:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR9_13]], i64 [[IDX_EXT8]] +; CHECK-LTO-NEXT: [[TMP109:%.*]] = load <16 x i8>, ptr [[ADD_PTR_14]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP110:%.*]] = zext <16 x i8> [[TMP109]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP111:%.*]] = load <16 x i8>, ptr [[ADD_PTR9_14]], align 1, !tbaa [[TBAA0]] +; CHECK-LTO-NEXT: [[TMP112:%.*]] = zext <16 x i8> [[TMP111]] to <16 x i16> +; CHECK-LTO-NEXT: [[TMP113:%.*]] = sub nsw <16 x i16> [[TMP110]], [[TMP112]] +; CHECK-LTO-NEXT: [[TMP114:%.*]] = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP113]], i1 true) +; CHECK-LTO-NEXT: [[TMP115:%.*]] = zext nneg <16 x i16> [[TMP114]] to <16 x i32> +; CHECK-LTO-NEXT: [[TMP127:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]]) +; CHECK-LTO-NEXT: [[OP_RDX_15:%.*]] = add i32 [[OP_RDX_14]], [[TMP127]] +; CHECK-LTO-NEXT: ret i32 [[OP_RDX_15]] +; +entry: + %p1.addr = alloca ptr, align 8 + %s_p1.addr = alloca i32, align 4 + %p2.addr = alloca ptr, align 8 + %s_p2.addr = alloca i32, align 4 + %i_sum = alloca i32, align 4 + %y = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + %x = alloca i32, align 4 + store ptr %p1, ptr %p1.addr, align 8, !tbaa !4 + store i32 %s_p1, ptr %s_p1.addr, align 4, !tbaa !9 + store ptr %p2, ptr %p2.addr, align 8, !tbaa !4 + store i32 %s_p2, ptr %s_p2.addr, align 4, !tbaa !9 + call void @llvm.lifetime.start.p0(i64 4, ptr %i_sum) #3 + store i32 0, ptr %i_sum, align 4, !tbaa !9 + call void @llvm.lifetime.start.p0(i64 4, ptr %y) #3 + store i32 0, ptr %y, align 4, !tbaa !9 + br label %for.cond + +for.cond: ; preds = %for.inc10, %entry + %0 = load i32, ptr %y, align 4, !tbaa !9 + %cmp = icmp slt i32 %0, 16 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + store i32 2, ptr %cleanup.dest.slot, align 4 + call void @llvm.lifetime.end.p0(i64 4, ptr %y) #3 + br label %for.end12 + +for.body: ; preds = %for.cond + call void 
@llvm.lifetime.start.p0(i64 4, ptr %x) #3 + store i32 0, ptr %x, align 4, !tbaa !9 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %1 = load i32, ptr %x, align 4, !tbaa !9 + %cmp2 = icmp slt i32 %1, 16 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.cond1 + store i32 5, ptr %cleanup.dest.slot, align 4 + call void @llvm.lifetime.end.p0(i64 4, ptr %x) #3 + br label %for.end + +for.body4: ; preds = %for.cond1 + %2 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %3 = load i32, ptr %x, align 4, !tbaa !9 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i8, ptr %2, i64 %idxprom + %4 = load i8, ptr %arrayidx, align 1, !tbaa !11 + %conv = zext i8 %4 to i32 + %5 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %6 = load i32, ptr %x, align 4, !tbaa !9 + %idxprom5 = sext i32 %6 to i64 + %arrayidx6 = getelementptr inbounds i8, ptr %5, i64 %idxprom5 + %7 = load i8, ptr %arrayidx6, align 1, !tbaa !11 + %conv7 = zext i8 %7 to i32 + %sub = sub nsw i32 %conv, %conv7 + %8 = call i32 @llvm.abs.i32(i32 %sub, i1 true) + %9 = load i32, ptr %i_sum, align 4, !tbaa !9 + %add = add nsw i32 %9, %8 + store i32 %add, ptr %i_sum, align 4, !tbaa !9 + br label %for.inc + +for.inc: ; preds = %for.body4 + %10 = load i32, ptr %x, align 4, !tbaa !9 + %inc = add nsw i32 %10, 1 + store i32 %inc, ptr %x, align 4, !tbaa !9 + br label %for.cond1, !llvm.loop !12 + +for.end: ; preds = %for.cond.cleanup3 + %11 = load i32, ptr %s_p1.addr, align 4, !tbaa !9 + %12 = load ptr, ptr %p1.addr, align 8, !tbaa !4 + %idx.ext = sext i32 %11 to i64 + %add.ptr = getelementptr inbounds i8, ptr %12, i64 %idx.ext + store ptr %add.ptr, ptr %p1.addr, align 8, !tbaa !4 + %13 = load i32, ptr %s_p2.addr, align 4, !tbaa !9 + %14 = load ptr, ptr %p2.addr, align 8, !tbaa !4 + %idx.ext8 = sext i32 %13 to i64 + %add.ptr9 = getelementptr inbounds i8, ptr %14, i64 %idx.ext8 + store ptr %add.ptr9, ptr %p2.addr, align 8, !tbaa !4 + br label %for.inc10 + +for.inc10: ; preds = %for.end + %15 = load i32, ptr %y, align 4, !tbaa !9 + %inc11 = add nsw i32 %15, 1 + store i32 %inc11, ptr %y, align 4, !tbaa !9 + br label %for.cond, !llvm.loop !14 + +for.end12: ; preds = %for.cond.cleanup + %16 = load i32, ptr %i_sum, align 4, !tbaa !9 + store i32 1, ptr %cleanup.dest.slot, align 4 + call void @llvm.lifetime.end.p0(i64 4, ptr %i_sum) #3 + ret i32 %16 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.abs.i32(i32, i1 immarg) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr) #1 + +attributes #0 = { nounwind uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+perfmon,+predres,+rand,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve-bitperm,+sve2,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind } + + +!4 = !{!5, !5, i64 0} +!5 = !{!"p1 omnipotent char", !6, 
i64 0} +!6 = !{!"any pointer", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = !{!10, !10, i64 0} +!10 = !{!"int", !7, i64 0} +!11 = !{!7, !7, i64 0} +!12 = distinct !{!12, !13} +!13 = !{!"llvm.loop.mustprogress"} +!14 = distinct !{!14, !13} +;. +; CHECK-O3: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-O3: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK-O3: [[META2]] = !{!"Simple C/C++ TBAA"} +;. +; CHECK-LTO: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-LTO: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0} +; CHECK-LTO: [[META2]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll index bbd4849c32296..d5edf83ee52e2 100644 --- a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll +++ b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll @@ -12,7 +12,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll index ee7698b116aa2..4b422f205138a 100644 --- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll @@ -14,7 +14,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll index 5f75bd788e4bb..cd2ed37b22db5 100644 --- a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll +++ b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll @@ -11,7 +11,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-LABEL: define void @test( -; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) 
unnamed_addr #[[ATTR0:[0-9]+]] { ; NOROTATION-NEXT: entry: ; NOROTATION-NEXT: br label [[LOOP_HEADER:%.*]] ; NOROTATION: loop.header: @@ -26,7 +26,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-NEXT: ret void ; ; ROTATION-LABEL: define void @test( -; ROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; ROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; ROTATION-NEXT: entry: ; ROTATION-NEXT: [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]] ; ROTATION-NEXT: br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll new file mode 100644 index 0000000000000..5681fb7346124 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=systemz -mcpu=z15 < %s -slp-threshold=-100 | FileCheck %s + +define void @test(i32 %0, i64 %1, i32 %2, i32 %3, ptr %4) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], ptr [[TMP4:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP56]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP73:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP98:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <60 x i32> , i32 [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <60 x i32> [[TMP11]], <60 x i32> poison, <60 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <60 x i32> poison, i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <60 x i32> [[TMP13]], i32 [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <60 x i32> [[TMP14]], i32 [[TMP98]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <60 x i32> [[TMP15]], i32 [[TMP73]], i32 6 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP17]], <2 x i32> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP18]], <2 x i32> [[TMP8]], i64 4) +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP19]], <8 x i32> poison, <60 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = xor <60 x i32> [[TMP12]], [[TMP20]] +; CHECK-NEXT: [[TMP130:%.*]] = call i32 @llvm.vector.reduce.or.v60i32(<60 x i32> [[TMP21]]) +; CHECK-NEXT: store i32 [[TMP130]], ptr [[TMP4]], align 4 +; CHECK-NEXT: ret void +; + %6 = trunc i64 %1 to i32 + %7 = xor i32 %6, 1 + %8 = xor i32 %6, 1 + %9 = or i32 %7, %8 + %10 = xor i32 %6, 1 + %11 = or i32 %9, %10 + %12 = xor i32 %6, 1 + %13 = or i32 %11, %12 + %14 = xor i32 %6, 1 + %15 = or i32 %13, %14 + %16 = xor i32 %6, 1 + %17 = or i32 %15, %16 + %18 = xor i32 %6, 1 + %19 = or i32 %17, %18 + %20 = xor i32 %6, 1 + %21 = or i32 %19, %20 + %22 = trunc i64 %1 to i32 + %23 = xor i32 %22, 1 + %24 = or i32 %23, %21 + %25 = 
xor i32 %22, 1 + %26 = or i32 %24, %25 + %27 = xor i32 %22, 1 + %28 = or i32 %26, %27 + %29 = xor i32 %22, 1 + %30 = or i32 %28, %29 + %31 = xor i32 %22, 1 + %32 = or i32 %30, %31 + %33 = xor i32 %22, 1 + %34 = or i32 %32, %33 + %35 = xor i32 %22, 1 + %36 = or i32 %34, %35 + %37 = xor i32 %22, 1 + %38 = or i32 %36, %37 + %39 = trunc i64 %1 to i32 + %40 = xor i32 %39, 1 + %41 = or i32 %40, %38 + %42 = xor i32 %39, 1 + %43 = or i32 %41, %42 + %44 = xor i32 %39, 1 + %45 = or i32 %43, %44 + %46 = xor i32 %39, 1 + %47 = or i32 %45, %46 + %48 = xor i32 %39, 1 + %49 = or i32 %47, %48 + %50 = xor i32 %39, 1 + %51 = or i32 %49, %50 + %52 = xor i32 %39, 1 + %53 = or i32 %51, %52 + %54 = xor i32 %39, 1 + %55 = or i32 %53, %54 + %56 = trunc i64 %1 to i32 + %57 = xor i32 %56, 1 + %58 = or i32 %57, %55 + %59 = xor i32 %56, 1 + %60 = or i32 %58, %59 + %61 = xor i32 %56, 1 + %62 = or i32 %60, %61 + %63 = xor i32 %56, 1 + %64 = or i32 %62, %63 + %65 = xor i32 %56, 1 + %66 = or i32 %64, %65 + %67 = xor i32 %56, 1 + %68 = or i32 %66, %67 + %69 = xor i32 %56, 1 + %70 = or i32 %68, %69 + %71 = xor i32 %56, 1 + %72 = or i32 %70, %71 + %73 = trunc i64 %1 to i32 + %74 = xor i32 %73, 1 + %75 = or i32 %74, %72 + %76 = xor i32 %73, 1 + %77 = or i32 %75, %76 + %78 = xor i32 %73, 1 + %79 = or i32 %77, %78 + %80 = xor i32 %73, 1 + %81 = or i32 %79, %80 + %82 = xor i32 %73, 1 + %83 = or i32 %81, %82 + %84 = xor i32 %73, 1 + %85 = or i32 %83, %84 + %86 = xor i32 %73, 1 + %87 = or i32 %85, %86 + %88 = xor i32 %0, %73 + %89 = or i32 %87, %88 + %90 = xor i32 %0, %2 + %91 = or i32 %90, %89 + %92 = xor i32 %0, %2 + %93 = or i32 %91, %92 + %94 = xor i32 %0, %2 + %95 = or i32 %93, %94 + %96 = xor i32 %0, %2 + %97 = or i32 %95, %96 + %98 = trunc i64 %1 to i32 + %99 = xor i32 %98, 1 + %100 = xor i32 %98, 1 + %101 = or i32 %99, %100 + %102 = xor i32 %98, 1 + %103 = or i32 %101, %102 + %104 = xor i32 %98, 1 + %105 = or i32 %103, %104 + %106 = xor i32 %98, 1 + %107 = or i32 %105, %106 + %108 = xor i32 %98, 1 + %109 = or i32 %107, %108 + %110 = xor i32 %98, 1 + %111 = or i32 %109, %110 + %112 = xor i32 %0, %98 + %113 = or i32 %111, %112 + %114 = xor i32 %0, %3 + %115 = or i32 %113, %114 + %116 = xor i32 %0, %3 + %117 = or i32 %115, %116 + %118 = xor i32 %0, %3 + %119 = or i32 %117, %118 + %120 = xor i32 %0, %3 + %121 = or i32 %119, %120 + %122 = xor i32 %0, %3 + %123 = or i32 %121, %122 + %124 = xor i32 %0, %3 + %125 = or i32 %123, %124 + %126 = xor i32 %0, %3 + %127 = or i32 %125, %126 + %128 = xor i32 %0, %3 + %129 = or i32 %127, %128 + %130 = or i32 %129, %97 + store i32 %130, ptr %4, align 4 + ret void +} + diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll index ee8592c04b62c..6b18d4069e0ae 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll @@ -77,7 +77,7 @@ define void @store_fadd_load(ptr %ptr) { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 ; CHECK-NEXT: [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[VEC:%.*]] = fadd <2 x float> [[VECL]], [[VECL1]] +; CHECK-NEXT: [[VEC:%.*]] = fadd <2 x float> [[VECL1]], [[VECL]] ; CHECK-NEXT: store <2 x float> [[VEC]], ptr [[PTR0]], align 4 ; CHECK-NEXT: ret void ; @@ -243,12 +243,34 @@ define void @diamondWithShuffle(ptr %ptr) { ret void } +; Same but with <2 x float> elements instead of scalars. 
+define void @diamondWithShuffleFromVec(ptr %ptr) { +; CHECK-LABEL: define void @diamondWithShuffleFromVec( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr <2 x float>, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[VECL:%.*]] = load <4 x float>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[VSHUF:%.*]] = shufflevector <4 x float> [[VECL]], <4 x float> [[VECL]], <4 x i32> +; CHECK-NEXT: [[VEC:%.*]] = fsub <4 x float> [[VECL]], [[VSHUF]] +; CHECK-NEXT: store <4 x float> [[VEC]], ptr [[PTR0]], align 8 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr <2 x float>, ptr %ptr, i32 0 + %ptr1 = getelementptr <2 x float>, ptr %ptr, i32 1 + %ld0 = load <2 x float>, ptr %ptr0 + %ld1 = load <2 x float>, ptr %ptr1 + %sub0 = fsub <2 x float> %ld0, %ld1 + %sub1 = fsub <2 x float> %ld1, %ld0 + store <2 x float> %sub0, ptr %ptr0 + store <2 x float> %sub1, ptr %ptr1 + ret void +} + define void @diamondMultiInput(ptr %ptr, ptr %ptrX) { ; CHECK-LABEL: define void @diamondMultiInput( ; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTRX:%.*]]) { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 -; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 ; CHECK-NEXT: [[LDX:%.*]] = load float, ptr [[PTRX]], align 4 +; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 ; CHECK-NEXT: [[VINS:%.*]] = insertelement <2 x float> poison, float [[LDX]], i32 0 ; CHECK-NEXT: [[VEXT:%.*]] = extractelement <2 x float> [[VECL]], i32 0 ; CHECK-NEXT: [[VINS1:%.*]] = insertelement <2 x float> [[VINS]], float [[VEXT]], i32 1 @@ -270,6 +292,39 @@ define void @diamondMultiInput(ptr %ptr, ptr %ptrX) { ret void } +; Same but vectorizing <2 x float> vectors instead of scalars. +define void @diamondMultiInputVector(ptr %ptr, ptr %ptrX) { +; CHECK-LABEL: define void @diamondMultiInputVector( +; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTRX:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr <2 x float>, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[LDX:%.*]] = load <2 x float>, ptr [[PTRX]], align 8 +; CHECK-NEXT: [[VECL:%.*]] = load <4 x float>, ptr [[PTR0]], align 8 +; CHECK-NEXT: [[VEXT:%.*]] = extractelement <2 x float> [[LDX]], i32 0 +; CHECK-NEXT: [[INSI:%.*]] = insertelement <4 x float> poison, float [[VEXT]], i32 0 +; CHECK-NEXT: [[VEXT1:%.*]] = extractelement <2 x float> [[LDX]], i32 1 +; CHECK-NEXT: [[INSI2:%.*]] = insertelement <4 x float> [[INSI]], float [[VEXT1]], i32 1 +; CHECK-NEXT: [[VEXT3:%.*]] = extractelement <4 x float> [[VECL]], i32 0 +; CHECK-NEXT: [[VINS4:%.*]] = insertelement <4 x float> [[INSI2]], float [[VEXT3]], i32 2 +; CHECK-NEXT: [[VEXT4:%.*]] = extractelement <4 x float> [[VECL]], i32 1 +; CHECK-NEXT: [[VINS5:%.*]] = insertelement <4 x float> [[VINS4]], float [[VEXT4]], i32 3 +; CHECK-NEXT: [[VEC:%.*]] = fsub <4 x float> [[VECL]], [[VINS5]] +; CHECK-NEXT: store <4 x float> [[VEC]], ptr [[PTR0]], align 8 +; CHECK-NEXT: ret void +; + %ptr0 = getelementptr <2 x float>, ptr %ptr, i32 0 + %ptr1 = getelementptr <2 x float>, ptr %ptr, i32 1 + %ld0 = load <2 x float>, ptr %ptr0 + %ld1 = load <2 x float>, ptr %ptr1 + + %ldX = load <2 x float>, ptr %ptrX + + %sub0 = fsub <2 x float> %ld0, %ldX + %sub1 = fsub <2 x float> %ld1, %ld0 + store <2 x float> %sub0, ptr %ptr0 + store <2 x float> %sub1, ptr %ptr1 + ret void +} + define void @diamondWithConstantVector(ptr %ptr) { ; CHECK-LABEL: define void @diamondWithConstantVector( ; CHECK-SAME: ptr [[PTR:%.*]]) { diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll 
b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll index f1c6e3297d79c..1b189831569f5 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll @@ -7,8 +7,8 @@ define void @pow2(ptr %ptr, float %val) { ; POW2-SAME: ptr [[PTR:%.*]], float [[VAL:%.*]]) { ; POW2-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 ; POW2-NEXT: [[PTR2:%.*]] = getelementptr float, ptr [[PTR]], i32 2 -; POW2-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 ; POW2-NEXT: [[LD2:%.*]] = load float, ptr [[PTR2]], align 4 +; POW2-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 ; POW2-NEXT: store <2 x float> [[VECL]], ptr [[PTR0]], align 4 ; POW2-NEXT: store float [[LD2]], ptr [[PTR2]], align 4 ; POW2-NEXT: ret void diff --git a/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll b/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll index 25d9d79154d35..add762ac2d894 100644 --- a/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll @@ -5,10 +5,10 @@ define i32 @repeated_splat(ptr %ptr, i32 %v) #0 { ; CHECK-LABEL: define i32 @repeated_splat( ; CHECK-SAME: ptr [[PTR:%.*]], i32 [[V:%.*]]) { ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 0 -; CHECK-NEXT: [[VECL:%.*]] = load <2 x i32>, ptr [[GEP0]], align 4 ; CHECK-NEXT: [[SPLAT:%.*]] = add i32 [[V]], 0 ; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i32> poison, i32 [[SPLAT]], i32 0 ; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i32> [[PACK]], i32 [[SPLAT]], i32 1 +; CHECK-NEXT: [[VECL:%.*]] = load <2 x i32>, ptr [[GEP0]], align 4 ; CHECK-NEXT: [[VEC:%.*]] = mul <2 x i32> [[VECL]], [[PACK1]] ; CHECK-NEXT: store <2 x i32> [[VEC]], ptr [[GEP0]], align 4 ; CHECK-NEXT: ret i32 0 @@ -31,6 +31,7 @@ define i32 @repeated_partial(ptr %ptr, i32 %v) #0 { ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 1 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 3 +; CHECK-NEXT: [[SPLAT:%.*]] = add i32 [[V]], 0 ; CHECK-NEXT: [[LD0:%.*]] = load i32, ptr [[GEP0]], align 4 ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[LD3:%.*]] = load i32, ptr [[GEP3]], align 4 @@ -39,7 +40,6 @@ define i32 @repeated_partial(ptr %ptr, i32 %v) #0 { ; CHECK-NEXT: [[PACK2:%.*]] = insertelement <4 x i32> [[PACK1]], i32 [[LD1]], i32 2 ; CHECK-NEXT: [[PACK3:%.*]] = insertelement <4 x i32> [[PACK2]], i32 [[LD3]], i32 3 ; CHECK-NEXT: [[VECL:%.*]] = load <4 x i32>, ptr [[GEP0]], align 4 -; CHECK-NEXT: [[SPLAT:%.*]] = add i32 [[V]], 0 ; CHECK-NEXT: [[VEC:%.*]] = mul <4 x i32> [[VECL]], [[PACK3]] ; CHECK-NEXT: store <4 x i32> [[VEC]], ptr [[GEP0]], align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll index 92a78a979192b..acbec80db6b06 100644 --- a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll +++ b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll @@ -49,3 +49,28 @@ define void @check_dag_scheduler_update(ptr noalias %p, ptr noalias %p1) { store i32 %add21, ptr %arrayidx23 ret void } + +; This used to generate use-before-def because of a buggy update of the +; top-of-schedule variable. 
+define <4 x float> @check_top_of_schedule(ptr %0) { +; CHECK-LABEL: define <4 x float> @check_top_of_schedule( +; CHECK-SAME: ptr [[TMP0:%.*]]) { +; CHECK-NEXT: [[INS_1:%.*]] = insertelement <4 x float> zeroinitializer, float poison, i64 0 +; CHECK-NEXT: [[TRUNC_1:%.*]] = fptrunc double 0.000000e+00 to float +; CHECK-NEXT: [[INS_2:%.*]] = insertelement <4 x float> [[INS_1]], float [[TRUNC_1]], i64 0 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[TMP0]], i64 1 +; CHECK-NEXT: store <2 x double> , ptr [[GEP_1]], align 8 +; CHECK-NEXT: ret <4 x float> [[INS_2]] +; + %trunc.1 = fptrunc double 0.000000e+00 to float + %trunc.2 = fptrunc double 1.000000e+00 to float + %ins.1 = insertelement <4 x float> zeroinitializer, float poison, i64 0 + %ins.2 = insertelement <4 x float> %ins.1, float %trunc.1, i64 0 + %ext.1 = fpext float %trunc.1 to double + %gep.1 = getelementptr double, ptr %0, i64 1 + store double %ext.1, ptr %gep.1, align 8 + %ext.2 = fpext float %trunc.2 to double + %gep.2 = getelementptr double, ptr %0, i64 2 + store double %ext.2, ptr %gep.2, align 8 + ret <4 x float> %ins.2 +} diff --git a/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-pr125116.ll b/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-pr125116.ll new file mode 100644 index 0000000000000..dafee60a491e7 --- /dev/null +++ b/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-pr125116.ll @@ -0,0 +1,88 @@ +; Test if StripNonLineTableDebugInfo crashes or produces invalid IR, +; this test contains a slightly complex debug info structure, +; which may trigger the bug mentioned in pr#125116 +; +; RUN: opt < %s -p=strip-nonlinetable-debuginfo -S | FileCheck %s +; +; CHECK-NOT: DIBasicType +; CHECK-NOT: DIDerivedType +; CHECK-NOT: DICompositeType +; CHECK-NOT: DILocation(line: 604, column: 1, scope: null) + +define void @main() !dbg !34 { + ret void, !dbg !68 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!31, !32} +!llvm.ident = !{!33} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.7 (tags/RELEASE_370/final)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2) +!1 = !DIFile(filename: "no filename", directory: "") +!2 = !{} +!3 = !{!4, !22} +!4 = !DIDerivedType(tag: DW_TAG_typedef, name: "float3x3", file: !1, line: 361, baseType: !5) +!5 = !DICompositeType(tag: DW_TAG_class_type, name: "matrix", file: !1, line: 246, size: 288, align: 32, elements: !6, templateParams: !17) +!6 = !{!7, !9, !10, !11, !12, !13, !14, !15, !16} +!7 = !DIDerivedType(tag: DW_TAG_member, name: "_11", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, flags: DIFlagPublic) +!8 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) +!9 = !DIDerivedType(tag: DW_TAG_member, name: "_12", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!10 = !DIDerivedType(tag: DW_TAG_member, name: "_13", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 64, flags: DIFlagPublic) +!11 = !DIDerivedType(tag: DW_TAG_member, name: "_21", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 96, flags: DIFlagPublic) +!12 = !DIDerivedType(tag: DW_TAG_member, name: "_22", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 128, flags: DIFlagPublic) +!13 = !DIDerivedType(tag: DW_TAG_member, name: "_23", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 160, flags: 
DIFlagPublic) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "_31", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 192, flags: DIFlagPublic) +!15 = !DIDerivedType(tag: DW_TAG_member, name: "_32", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 224, flags: DIFlagPublic) +!16 = !DIDerivedType(tag: DW_TAG_member, name: "_33", scope: !5, file: !1, line: 246, baseType: !8, size: 32, align: 32, offset: 256, flags: DIFlagPublic) +!17 = !{!18, !19, !21} +!18 = !DITemplateTypeParameter(name: "element", type: !8) +!19 = !DITemplateValueParameter(name: "row_count", type: !20, value: i32 3) +!20 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!21 = !DITemplateValueParameter(name: "col_count", type: !20, value: i32 3) +!22 = !DIDerivedType(tag: DW_TAG_typedef, name: "float4", file: !1, baseType: !23) +!23 = !DICompositeType(tag: DW_TAG_class_type, name: "vector", file: !1, size: 128, align: 32, elements: !24, templateParams: !29) +!24 = !{!25, !26, !27, !28} +!25 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !23, file: !1, baseType: !8, size: 32, align: 32, flags: DIFlagPublic) +!26 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !23, file: !1, baseType: !8, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!27 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !23, file: !1, baseType: !8, size: 32, align: 32, offset: 64, flags: DIFlagPublic) +!28 = !DIDerivedType(tag: DW_TAG_member, name: "w", scope: !23, file: !1, baseType: !8, size: 32, align: 32, offset: 96, flags: DIFlagPublic) +!29 = !{!18, !30} +!30 = !DITemplateValueParameter(name: "element_count", type: !20, value: i32 4) +!31 = !{i32 2, !"Dwarf Version", i32 4} +!32 = !{i32 2, !"Debug Info Version", i32 3} +!33 = !{!"clang version 3.7 (tags/RELEASE_370/final)"} +!34 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 581, type: !35, scopeLine: 582, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) +!35 = !DISubroutineType(types: !36) +!36 = !{null, !37, !58} +!37 = !DICompositeType(tag: DW_TAG_structure_type, name: "VertexInput", file: !1, line: 254, size: 416, align: 32, elements: !38) +!38 = !{!39, !40, !48, !57} +!39 = !DIDerivedType(tag: DW_TAG_member, name: "Position", scope: !37, file: !1, line: 256, baseType: !22, size: 128, align: 32) +!40 = !DIDerivedType(tag: DW_TAG_member, name: "TexCoord", scope: !37, file: !1, line: 257, baseType: !41, size: 64, align: 32, offset: 128) +!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "float2", file: !1, baseType: !42) +!42 = !DICompositeType(tag: DW_TAG_class_type, name: "vector", file: !1, size: 64, align: 32, elements: !43, templateParams: !46) +!43 = !{!44, !45} +!44 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !42, file: !1, baseType: !8, size: 32, align: 32, flags: DIFlagPublic) +!45 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !42, file: !1, baseType: !8, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!46 = !{!18, !47} +!47 = !DITemplateValueParameter(name: "element_count", type: !20, value: i32 2) +!48 = !DIDerivedType(tag: DW_TAG_member, name: "Normal", scope: !37, file: !1, line: 258, baseType: !49, size: 96, align: 32, offset: 192) +!49 = !DIDerivedType(tag: DW_TAG_typedef, name: "float3", file: !1, baseType: !50) +!50 = !DICompositeType(tag: DW_TAG_class_type, name: "vector", file: !1, size: 96, align: 32, elements: !51, templateParams: !55) +!51 = !{!52, !53, !54} +!52 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: 
!50, file: !1, baseType: !8, size: 32, align: 32, flags: DIFlagPublic) +!53 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !50, file: !1, baseType: !8, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!54 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !50, file: !1, baseType: !8, size: 32, align: 32, offset: 64, flags: DIFlagPublic) +!55 = !{!18, !56} +!56 = !DITemplateValueParameter(name: "element_count", type: !20, value: i32 3) +!57 = !DIDerivedType(tag: DW_TAG_member, name: "Tangent", scope: !37, file: !1, line: 259, baseType: !22, size: 128, align: 32, offset: 288) +!58 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !59) +!59 = !DICompositeType(tag: DW_TAG_structure_type, name: "VertexOutput", file: !1, line: 269, size: 672, align: 32, elements: !60) +!60 = !{!61, !62, !63, !64, !65, !66, !67} +!61 = !DIDerivedType(tag: DW_TAG_member, name: "Position", scope: !59, file: !1, line: 271, baseType: !22, size: 128, align: 32) +!62 = !DIDerivedType(tag: DW_TAG_member, name: "TexCoord", scope: !59, file: !1, line: 272, baseType: !41, size: 64, align: 32, offset: 128) +!63 = !DIDerivedType(tag: DW_TAG_member, name: "TangentInView", scope: !59, file: !1, line: 273, baseType: !49, size: 96, align: 32, offset: 192) +!64 = !DIDerivedType(tag: DW_TAG_member, name: "BitangentInView", scope: !59, file: !1, line: 274, baseType: !49, size: 96, align: 32, offset: 288) +!65 = !DIDerivedType(tag: DW_TAG_member, name: "NormalInView", scope: !59, file: !1, line: 275, baseType: !49, size: 96, align: 32, offset: 384) +!66 = !DIDerivedType(tag: DW_TAG_member, name: "EyeDirectionInView", scope: !59, file: !1, line: 276, baseType: !49, size: 96, align: 32, offset: 480) +!67 = !DIDerivedType(tag: DW_TAG_member, name: "PositionInView", scope: !59, file: !1, line: 277, baseType: !49, size: 96, align: 32, offset: 576) +!68 = !DILocation(line: 604, column: 1, scope: !34) diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s index 750809128189f..3cd7a0503e301 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s @@ -13,7 +13,7 @@ ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_kernarg_size 0 ; CHECK-NEXT: ; SHARED_VGPR_COUNT 0 -; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 0 ; CHECK-NEXT: ; TRAP_ON_START 0 ; CHECK-NEXT: ; TRAP_ON_END 0 ; CHECK-NEXT: ; IMAGE_OP 0 @@ -70,7 +70,7 @@ ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_kernarg_size 0 ; CHECK-NEXT: .amdhsa_shared_vgpr_count 0 -; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 0 ; CHECK-NEXT: ; TRAP_ON_START 0 ; CHECK-NEXT: ; TRAP_ON_END 0 ; CHECK-NEXT: ; IMAGE_OP 0 @@ -114,6 +114,7 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_shared_vgpr_count 0 + .amdhsa_inst_pref_size 0 .end_amdhsa_kernel ;--- 3.s @@ -127,7 +128,7 @@ ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_kernarg_size 0 ; CHECK-NEXT: .amdhsa_shared_vgpr_count 1 -; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 63 ; CHECK-NEXT: ; TRAP_ON_START 0 ; CHECK-NEXT: ; TRAP_ON_END 0 ; CHECK-NEXT: ; IMAGE_OP 0 @@ -171,6 +172,7 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_shared_vgpr_count 1 + .amdhsa_inst_pref_size 63 .end_amdhsa_kernel ;--- 4.s @@ -184,7 +186,7 @@ ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_kernarg_size 0 ; CHECK-NEXT: 
.amdhsa_shared_vgpr_count 1 -; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 63 ; CHECK-NEXT: ; TRAP_ON_START 0 ; CHECK-NEXT: ; TRAP_ON_END 0 ; CHECK-NEXT: ; IMAGE_OP 0 @@ -228,5 +230,6 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_shared_vgpr_count 1 + .amdhsa_inst_pref_size 63 .amdhsa_wavefront_size32 0 .end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s index c644e15efc8d7..ed2b87d9885c6 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s @@ -12,7 +12,7 @@ ; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_kernarg_size 0 -; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 0 ; CHECK-NEXT: ; GLG_EN 0 ; CHECK-NEXT: ; IMAGE_OP 0 ; CHECK-NEXT: .amdhsa_next_free_vgpr 32 @@ -66,7 +66,7 @@ ; CHECK-NEXT: .amdhsa_group_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0 ; CHECK-NEXT: .amdhsa_kernarg_size 0 -; CHECK-NEXT: ; INST_PREF_SIZE 0 +; CHECK-NEXT: .amdhsa_inst_pref_size 255 ; CHECK-NEXT: ; GLG_EN 0 ; CHECK-NEXT: ; IMAGE_OP 0 ; CHECK-NEXT: .amdhsa_next_free_vgpr 32 @@ -108,4 +108,5 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_wavefront_size32 0 + .amdhsa_inst_pref_size 255 .end_amdhsa_kernel diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp index dbdec77327774..8553eb70ebe49 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/SimpleRemoteEPCServer.h" +#include "llvm/ExecutionEngine/Orc/TargetProcess/UnwindInfoManager.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DynamicLibrary.h" @@ -190,6 +191,10 @@ int main(int argc, char *argv[]) { SimpleRemoteEPCServer::defaultBootstrapSymbols(); addDefaultBootstrapValuesForHostProcess(S.bootstrapMap(), S.bootstrapSymbols()); +#ifdef __APPLE__ + if (UnwindInfoManager::TryEnable()) + UnwindInfoManager::addBootstrapSymbols(S.bootstrapSymbols()); +#endif // __APPLE__ S.services().push_back( std::make_unique()); S.services().push_back( diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index a7ab7554902f8..9e6d3df297fc7 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -44,6 +44,7 @@ #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderVTune.h" #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h" +#include "llvm/ExecutionEngine/Orc/UnwindInfoRegistrationPlugin.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -1204,6 +1205,19 @@ Session::Session(std::unique_ptr EPC, Error &Err) inconvertibleErrorCode()); return; } + } else if (TT.isOSBinFormatMachO()) { + if (!NoExec) { + std::optional ForceEHFrames; + if ((Err = ES.getBootstrapMapValue("darwin-use-ehframes-only", + ForceEHFrames))) + return; + bool UseEHFrames = 
ForceEHFrames ? *ForceEHFrames : false; + if (!UseEHFrames) + ObjLayer.addPlugin(ExitOnErr(UnwindInfoRegistrationPlugin::Create(ES))); + else + ObjLayer.addPlugin(std::make_unique( + ES, ExitOnErr(EPCEHFrameRegistrar::Create(ES)))); + } } else if (TT.isOSBinFormatELF()) { if (!NoExec) ObjLayer.addPlugin(std::make_unique( diff --git a/llvm/unittests/Analysis/CaptureTrackingTest.cpp b/llvm/unittests/Analysis/CaptureTrackingTest.cpp index 73dd82fb921f7..3f5c10d935167 100644 --- a/llvm/unittests/Analysis/CaptureTrackingTest.cpp +++ b/llvm/unittests/Analysis/CaptureTrackingTest.cpp @@ -77,9 +77,9 @@ TEST(CaptureTracking, MaxUsesToExplore) { struct CollectingCaptureTracker : public CaptureTracker { SmallVector Captures; void tooManyUses() override { } - bool captured(const Use *U) override { + Action captured(const Use *U, UseCaptureInfo CI) override { Captures.push_back(U); - return false; + return Continue; } }; diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 963cdcc0275e1..4f580e7539f4d 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -27,6 +27,7 @@ add_llvm_unittest(CodeGenTests CCStateTest.cpp DIEHashTest.cpp DIETest.cpp + DroppedVariableStatsMIRTest.cpp DwarfStringPoolEntryRefTest.cpp InstrRefLDVTest.cpp LowLevelTypeTest.cpp diff --git a/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp b/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp new file mode 100644 index 0000000000000..157060ec4eebe --- /dev/null +++ b/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp @@ -0,0 +1,1081 @@ +//===- unittests/CodeGen/DroppedVariableStatsMIRTest.cpp ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/DroppedVariableStatsMIR.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/MIRParser/MIRParser.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Pass.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace { + +std::unique_ptr +createTargetMachine(std::string TT, StringRef CPU, StringRef FS) { + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TT, Error); + if (!T) + return nullptr; + TargetOptions Options; + return std::unique_ptr( + static_cast(T->createTargetMachine( + TT, CPU, FS, Options, std::nullopt, std::nullopt))); +} + +std::unique_ptr parseMIR(const TargetMachine &TM, StringRef MIRCode, + MachineModuleInfo &MMI, LLVMContext *Context) { + SMDiagnostic Diagnostic; + std::unique_ptr M; + std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); + auto MIR = createMIRParser(std::move(MBuffer), *Context); + if (!MIR) + return nullptr; + + std::unique_ptr Mod = MIR->parseIRModule(); + if (!Mod) + return nullptr; + + Mod->setDataLayout(TM.createDataLayout()); + + if (MIR->parseMachineFunctions(*Mod, MMI)) { + M.reset(); + return nullptr; + } + return Mod; +} +// This test ensures that if a DBG_VALUE and an instruction that exists in the +// same scope as that DBG_VALUE are both deleted as a result of an optimization +// pass, debug information is considered not dropped. +TEST(DroppedVariableStatsMIR, BothDeleted) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4) + !12 = !DILocation(line: 2, column: 11, scope: !4) + +... 
+--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + for (auto &MI : MBB) { + auto *DbgLoc = MI.getDebugLoc().get(); + if (DbgLoc) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), false); +} + +// This test ensures that if a DBG_VALUE is dropped after an optimization pass, +// but an instruction that shares the same scope as the DBG_VALUE still exists, +// debug information is conisdered dropped. 
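+//
+// Concretely, in the MIR below the simulated pass erases only
+//   DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11
+// while keeping
+//   %2:_(s32) = nsw G_ADD %0, %1, debug-location !12
+// whose scope (!4) matches the variable's scope, so the variable's value is
+// no longer described even though code from that scope survives.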
+TEST(DroppedVariableStatsMIR, DbgValLost) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4) + !12 = !DILocation(line: 2, column: 11, scope: !4) + +... +--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = 
MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), true); +} + +// This test ensures that if a #dbg_value is dropped after an optimization pass, +// but an instruction that has an unrelated scope as the #dbg_value still +// exists, debug information is conisdered not dropped. +TEST(DroppedVariableStatsMIR, UnrelatedScopes) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4) + !12 = !DILocation(line: 2, column: 11, scope: !13) + !13 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + +... 
+--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), false); +} + +// This test ensures that if a #dbg_value is dropped after an optimization pass, +// but an instruction that has a scope which is a child of the #dbg_value scope +// still exists, debug information is conisdered dropped. 
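+//
+// In the MIR below, the surviving G_ADD carries debug-location !12 whose
+// scope !13 is a DILexicalBlock nested inside the variable's scope !4, so
+// erasing the DBG_VALUE for !10 still counts as dropped debug info.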
+TEST(DroppedVariableStatsMIR, ChildScopes) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4) + !12 = !DILocation(line: 2, column: 11, scope: !13) + !13 = distinct !DILexicalBlock(scope: !4, file: !5, line: 10, column: 28) + +... +--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, 
MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), true); +} + +// This test ensures that if a DBG_VALUE is dropped after an optimization pass, +// but an instruction that has a scope which is a child of the DBG_VALUE scope +// still exists, and the DBG_VALUE is inlined at another location, debug +// information is conisdered not dropped. +TEST(DroppedVariableStatsMIR, InlinedAt) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4, inlinedAt: !14) + !12 = !DILocation(line: 2, column: 11, scope: !13) + !13 = distinct !DILexicalBlock(scope: !4, file: !5, line: 10, column: 28) + !14 = !DILocation(line: 3, column: 2, scope: !4) + +... 
+--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), false); +} + +// This test ensures that if a DBG_VALUE is dropped after an optimization pass, +// but an instruction that has a scope which is a child of the DBG_VALUE scope +// still exists, and the DBG_VALUE and the instruction are inlined at another +// location, debug information is conisdered dropped. 
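+//
+// In the MIR below, both the DBG_VALUE location (!11) and the surviving
+// G_ADD location (!12) are inlined at the same call site (!14), so the
+// erased DBG_VALUE is attributed to code that is still present and is
+// counted as dropped debug info.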
+TEST(DroppedVariableStatsMIR, InlinedAtShared) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4, inlinedAt: !14) + !12 = !DILocation(line: 2, column: 11, scope: !13, inlinedAt: !14) + !13 = distinct !DILexicalBlock(scope: !4, file: !5, line: 10, column: 28) + !14 = !DILocation(line: 3, column: 2, scope: !4) + +... +--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + 
return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), true); +} + +// This test ensures that if a DBG_VALUE is dropped after an optimization pass, +// but an instruction that has a scope which is a child of the DBG_VALUE scope +// still exists, and the instruction is inlined at a location that is the +// DBG_VALUE's inlined at location, debug information is conisdered dropped. +TEST(DroppedVariableStatsMIR, InlinedAtChild) { + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + PassInstrumentationCallbacks PIC; + PassInstrumentation PI(&PIC); + + LLVMContext C; + + const char *MIR = + R"( +--- | + ; ModuleID = '/tmp/test.ll' + source_filename = "/tmp/test.ll" + target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + + define noundef range(i32 -2147483647, -2147483648) i32 @_Z3fooi(i32 noundef %x) local_unnamed_addr !dbg !4 { + entry: + #dbg_value(i32 %x, !10, !DIExpression(), !11) + %add = add nsw i32 %x, 1, !dbg !12 + ret i32 0 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2} + !llvm.ident = !{!3} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") + !1 = !DIFile(filename: "/tmp/code.cpp", directory: "/") + !2 = !{i32 2, !"Debug Info Version", i32 3} + !3 = !{!"clang"} + !4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !5, file: !5, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !9) + !5 = !DIFile(filename: "/tmp/code.cpp", directory: "") + !6 = !DISubroutineType(types: !7) + !7 = !{!8, !8} + !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !9 = !{!10} + !10 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !5, line: 1, type: !8) + !11 = !DILocation(line: 0, scope: !4, inlinedAt: !14) + !12 = !DILocation(line: 2, column: 11, scope: !13, inlinedAt: !15) + !13 = distinct !DILexicalBlock(scope: !4, file: !5, line: 10, column: 28) + !14 = !DILocation(line: 3, column: 2, scope: !4) + !15 = !DILocation(line: 4, column: 5, scope: !13, inlinedAt: !14) + +... 
+--- +name: _Z3fooi +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: _, preferred-register: '', flags: [ ] } + - { id: 1, class: _, preferred-register: '', flags: [ ] } + - { id: 2, class: _, preferred-register: '', flags: [ ] } + - { id: 3, class: _, preferred-register: '', flags: [ ] } +liveins: + - { reg: '$w0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $w0 + + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_CONSTANT i32 0 + DBG_VALUE %0(s32), $noreg, !10, !DIExpression(), debug-location !11 + %2:_(s32) = nsw G_ADD %0, %1, debug-location !12 + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + )"; + auto TM = createTargetMachine(Triple::normalize("aarch64--"), "", ""); + if (!TM) + return; + MachineModuleInfo MMI(TM.get()); + std::unique_ptr M = parseMIR(*TM, MIR, MMI, &C); + ASSERT_TRUE(M); + + DroppedVariableStatsMIR Stats; + auto *MF = MMI.getMachineFunction(*M->getFunction("_Z3fooi")); + Stats.runBeforePass("Test", MF); + + // This loop simulates an IR pass that drops debug information. + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + if (MI.isDebugValueLike()) { + MI.eraseFromParent(); + break; + } + } + break; + } + + Stats.runAfterPass("Test", MF); + ASSERT_EQ(Stats.getPassDroppedVariables(), true); +} + +} // end anonymous namespace diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index 2fd52860e71b9..3a55d88f03d49 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -923,12 +923,13 @@ TEST_F(IRBuilderTest, DIBuilder) { { /* dbg.label | DbgLabelRecord */ // Insert before I and check order. - ExpectOrder(DIB.insertLabel(Label, LabelLoc, I), I->getIterator()); + ExpectOrder(DIB.insertLabel(Label, LabelLoc, I->getIterator()), + I->getIterator()); // We should be able to insert at the end of the block, even if there's // no terminator yet. Note that in RemoveDIs mode this record won't get // inserted into the block untill another instruction is added. - DbgInstPtr LabelRecord = DIB.insertLabel(Label, LabelLoc, BB); + DbgInstPtr LabelRecord = DIB.insertLabel(Label, LabelLoc, BB->end()); // Specifically do not insert a terminator, to check this works. `I` // should have absorbed the DbgLabelRecord in the new debug info mode. 
I = Builder.CreateAlloca(Builder.getInt32Ty()); @@ -945,7 +946,7 @@ TEST_F(IRBuilderTest, DIBuilder) { DIB.createAutoVariable(BarSP, "Y", File, 2, IntType, true); { /* dbg.value | DbgVariableRecord::Value */ ExpectOrder(DIB.insertDbgValueIntrinsic(I, VarX, DIB.createExpression(), - VarLoc, I), + VarLoc, I->getIterator()), I->getIterator()); // Check inserting at end of the block works as with labels. DbgInstPtr VarXValue = DIB.insertDbgValueIntrinsic( @@ -955,7 +956,8 @@ TEST_F(IRBuilderTest, DIBuilder) { EXPECT_EQ(BB->getTrailingDbgRecords(), nullptr); } { /* dbg.declare | DbgVariableRecord::Declare */ - ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, I), + ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, + I->getIterator()), I->getIterator()); // Check inserting at end of the block works as with labels. DbgInstPtr VarYDeclare = diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp index 35107e50b32db..9c13908da44bb 100644 --- a/llvm/unittests/Support/ModRefTest.cpp +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ModRef.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" #include @@ -21,7 +20,8 @@ TEST(ModRefTest, PrintMemoryEffects) { std::string S; raw_string_ostream OS(S); OS << MemoryEffects::none(); - EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); + EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, ErrnoMem: " + "NoModRef, Other: NoModRef"); } } // namespace diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp index f2b73c282b764..03769ff59e372 100644 --- a/llvm/unittests/Transforms/Utils/CloningTest.cpp +++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp @@ -508,7 +508,7 @@ class CloneFunc : public ::testing::Test { auto *Variable = DBuilder.createAutoVariable(Subprogram, "x", File, 5, IntType, true); auto *DL = DILocation::get(Subprogram->getContext(), 5, 0, Subprogram); - DBuilder.insertDeclare(Alloca, Variable, E, DL, Store); + DBuilder.insertDeclare(Alloca, Variable, E, DL, Store->getIterator()); DBuilder.insertDbgValueIntrinsic(AllocaContent, Variable, E, DL, Entry); // Also create an inlined variable. // Create a distinct struct type that we should not duplicate during @@ -528,7 +528,8 @@ class CloneFunc : public ::testing::Test { Subprogram->getContext(), 9, 4, Scope, DILocation::get(Subprogram->getContext(), 5, 2, Subprogram)); IBuilder.SetCurrentDebugLocation(InlinedDL); - DBuilder.insertDeclare(Alloca, InlinedVar, E, InlinedDL, Store); + DBuilder.insertDeclare(Alloca, InlinedVar, E, InlinedDL, + Store->getIterator()); IBuilder.CreateStore(IBuilder.getInt32(2), Alloca); // Finalize the debug info. DBuilder.finalize(); diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index f1e9afefb4531..37f29428e900a 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -1013,3 +1013,42 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %arg) { EXPECT_EQ(S2N->getNextNode(), S1N); EXPECT_EQ(S1N->getNextNode(), nullptr); } + +// Extending an "Old" interval with no mem instructions. 
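+//
+// The initial DAG interval contains only the two zext instructions and hence
+// no memory nodes; extending it downwards over the stores of %v2/%v3 (and,
+// after clearing, upwards over the stores of %v0/%v1) must still record the
+// store-to-store memory dependency in the newly added region.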
+TEST_F(DependencyGraphTest, ExtendDAGWithNoMem) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, i8 %v, i8 %v0, i8 %v1, i8 %v2, i8 %v3) { + store i8 %v0, ptr %ptr + store i8 %v1, ptr %ptr + %zext1 = zext i8 %v to i32 + %zext2 = zext i8 %v to i32 + store i8 %v2, ptr %ptr + store i8 %v3, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *S0 = cast(&*It++); + auto *S1 = cast(&*It++); + auto *Z1 = cast(&*It++); + auto *Z2 = cast(&*It++); + auto *S2 = cast(&*It++); + auto *S3 = cast(&*It++); + + sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx); + // Create a non-empty DAG that contains no memory instructions. + DAG.extend({Z1, Z2}); + // Now extend it downwards. + DAG.extend({S2, S3}); + EXPECT_TRUE(memDependency(DAG.getNode(S2), DAG.getNode(S3))); + + // Same but upwards. + DAG.clear(); + DAG.extend({Z1, Z2}); + DAG.extend({S0, S1}); + EXPECT_TRUE(memDependency(DAG.getNode(S0), DAG.getNode(S1))); +} diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp index 1d7c8f9cdde04..5b033f0edcb02 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp @@ -85,3 +85,30 @@ define void @foo(i8 %v0, i8 %v1, i8 %v2, i8 %v3, <2 x i8> %vec) { EXPECT_FALSE(IMaps.getOrigLane(VAdd0, Add1)); EXPECT_EQ(IMaps.getVectorForOrig(Add1), nullptr); } + +TEST_F(InstrMapsTest, VectorLanes) { + parseIR(C, R"IR( +define void @foo(<2 x i8> %v0, <2 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3) { + %vadd0 = add <2 x i8> %v0, %v1 + %vadd1 = add <2 x i8> %v0, %v1 + %vadd2 = add <4 x i8> %v2, %v3 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + + auto *VAdd0 = cast(&*It++); + auto *VAdd1 = cast(&*It++); + auto *VAdd2 = cast(&*It++); + + sandboxir::InstrMaps IMaps(Ctx); + + // Check that the vector lanes are calculated correctly. 
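+  // %vadd0 and %vadd1 are each <2 x i8>, so when they are packed into the
+  // <4 x i8> %vadd2, %vadd0 covers lanes 0-1 and %vadd1 covers lanes 2-3;
+  // the original-lane lookup should therefore return 0 and 2 respectively.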
+ IMaps.registerVector({VAdd0, VAdd1}, VAdd2); + EXPECT_EQ(*IMaps.getOrigLane(VAdd2, VAdd0), 0U); + EXPECT_EQ(*IMaps.getOrigLane(VAdd2, VAdd1), 2U); +} diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp index 373af27ffbff0..0d5d86acaee89 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp @@ -253,6 +253,7 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) { %add0 = add i8 %v0, 0 %add1 = add i8 %v1, 1 br label %bb1 + bb1: store i8 %add0, ptr %ptr0 store i8 %add1, ptr %ptr1 @@ -392,3 +393,77 @@ define void @foo(ptr %ptr) { EXPECT_TRUE(ReadyList.empty()); EXPECT_THAT(Nodes, testing::UnorderedElementsAre(L0N, RetN)); } + +TEST_F(SchedulerTest, ReadyListPriorities) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { +bb0: + br label %bb1 + +bb1: + %phi0 = phi i8 [0, %bb0], [1, %bb1] + %phi1 = phi i8 [0, %bb0], [1, %bb1] + %ld0 = load i8, ptr %ptr + store i8 %ld0, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB1 = getBasicBlockByName(F, "bb1"); + auto It = BB1->begin(); + auto *Phi0 = cast(&*It++); + auto *Phi1 = cast(&*It++); + auto *L0 = cast(&*It++); + auto *S0 = cast(&*It++); + auto *Ret = cast(&*It++); + + sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx); + DAG.extend({&*BB1->begin(), BB1->getTerminator()}); + auto *Phi0N = DAG.getNode(Phi0); + auto *Phi1N = DAG.getNode(Phi1); + auto *L0N = DAG.getNode(L0); + auto *S0N = DAG.getNode(S0); + auto *RetN = DAG.getNode(Ret); + + sandboxir::ReadyListContainer ReadyList; + // Check PHI vs non-PHI. + ReadyList.insert(S0N); + ReadyList.insert(Phi0N); + EXPECT_EQ(ReadyList.pop(), Phi0N); + EXPECT_EQ(ReadyList.pop(), S0N); + ReadyList.insert(Phi0N); + ReadyList.insert(S0N); + EXPECT_EQ(ReadyList.pop(), Phi0N); + EXPECT_EQ(ReadyList.pop(), S0N); + // Check PHI vs terminator. + ReadyList.insert(RetN); + ReadyList.insert(Phi1N); + EXPECT_EQ(ReadyList.pop(), Phi1N); + EXPECT_EQ(ReadyList.pop(), RetN); + ReadyList.insert(Phi1N); + ReadyList.insert(RetN); + EXPECT_EQ(ReadyList.pop(), Phi1N); + EXPECT_EQ(ReadyList.pop(), RetN); + // Check terminator vs non-terminator. + ReadyList.insert(RetN); + ReadyList.insert(L0N); + EXPECT_EQ(ReadyList.pop(), L0N); + EXPECT_EQ(ReadyList.pop(), RetN); + ReadyList.insert(L0N); + ReadyList.insert(RetN); + EXPECT_EQ(ReadyList.pop(), L0N); + EXPECT_EQ(ReadyList.pop(), RetN); + // Check all, program order. 
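+  // With all five nodes queued at once, the ready list should pop the two
+  // PHIs first, then the load and the store in program order, and the
+  // terminator (ret) last.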
+ ReadyList.insert(RetN); + ReadyList.insert(L0N); + ReadyList.insert(Phi1N); + ReadyList.insert(S0N); + ReadyList.insert(Phi0N); + EXPECT_EQ(ReadyList.pop(), Phi0N); + EXPECT_EQ(ReadyList.pop(), Phi1N); + EXPECT_EQ(ReadyList.pop(), L0N); + EXPECT_EQ(ReadyList.pop(), S0N); + EXPECT_EQ(ReadyList.pop(), RetN); +} diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index a46a83ea12284..3ea76ed414d91 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -283,20 +283,23 @@ void InstrInfoEmitter::emitOperandNameMappings( OS << "#ifdef GET_INSTRINFO_OPERAND_ENUM\n"; OS << "#undef GET_INSTRINFO_OPERAND_ENUM\n"; - OS << "namespace llvm::" << Namespace << "::OpName {\n"; - OS << "enum {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; + OS << "enum class OpName {\n"; for (const auto &[I, Op] : enumerate(OperandNameToID)) OS << " " << Op.first << " = " << I << ",\n"; - OS << " OPERAND_LAST = " << NumOperandNames << ",\n"; - OS << "};\n"; - OS << "} // end namespace llvm::" << Namespace << "::OpName\n"; + OS << " NUM_OPERAND_NAMES = " << NumOperandNames << ",\n"; + OS << "}; // enum class OpName\n\n"; + OS << "LLVM_READONLY\n"; + OS << "int16_t getNamedOperandIdx(uint16_t Opcode, OpName Name);\n"; + OS << "} // end namespace llvm::" << Namespace << '\n'; OS << "#endif //GET_INSTRINFO_OPERAND_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_NAMED_OPS\n"; OS << "#undef GET_INSTRINFO_NAMED_OPS\n"; OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; - OS << "int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx) {\n"; + OS << "int16_t getNamedOperandIdx(uint16_t Opcode, OpName Name) {\n"; + OS << " assert(Name != OpName::NUM_OPERAND_NAMES);\n"; if (NumOperandNames != 0) { assert(MaxOperandNo <= INT16_MAX && "Too many operands for the operand name -> index table"); @@ -320,7 +323,8 @@ void InstrInfoEmitter::emitOperandNameMappings( for (const auto &[TableIndex, Entry] : enumerate(OperandMap)) { for (StringRef Name : Entry.second) OS << " case " << Namespace << "::" << Name << ":\n"; - OS << " return OperandMap[" << TableIndex << "][NamedIdx];\n"; + OS << " return OperandMap[" << TableIndex + << "][static_cast(Name)];\n"; } OS << " default: return -1;\n"; OS << " }\n"; diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 01e83f4d9d41d..2f9ec2e6e7a22 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -660,17 +660,17 @@ static void printMask(raw_ostream &OS, LaneBitmask Val) { // Try to combine Idx's compose map into Vec if it is compatible. // Return false if it's not possible. static bool combine(const CodeGenSubRegIndex *Idx, - SmallVectorImpl &Vec) { + SmallVectorImpl &Vec) { const CodeGenSubRegIndex::CompMap &Map = Idx->getComposites(); for (const auto &I : Map) { - CodeGenSubRegIndex *&Entry = Vec[I.first->EnumValue - 1]; + const CodeGenSubRegIndex *&Entry = Vec[I.first->EnumValue - 1]; if (Entry && Entry != I.second) return false; } // All entries are compatible. Make it so. for (const auto &I : Map) { - auto *&Entry = Vec[I.first->EnumValue - 1]; + const CodeGenSubRegIndex *&Entry = Vec[I.first->EnumValue - 1]; assert((!Entry || Entry == I.second) && "Expected EnumValue to be unique"); Entry = I.second; } @@ -692,7 +692,7 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, // Map each Sub-register index to a compatible table row. 
SmallVector RowMap; - SmallVector, 4> Rows; + SmallVector, 4> Rows; auto SubRegIndicesSize = std::distance(SubRegIndices.begin(), SubRegIndices.end()); diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 48a338aca9c8e..cb1e56859d083 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -353,6 +353,8 @@ def pr_comment_text_for_diff(self, diff: str) -> str: def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str]: files = self.filter_changed_files(changed_files) + if not files: + return None # Use git to find files that have had a change in the number of undefs regex = "([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)" @@ -379,10 +381,6 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str # Each file is prefixed like: # diff --git a/file b/file for file in re.split("^diff --git ", stdout, 0, re.MULTILINE): - # We skip checking in MIR files as undef is a valid token and not - # going away. - if file.endswith(".mir"): - continue # search for additions of undef if re.search(r"^[+](?!\s*#\s*).*(\bundef\b|UndefValue::get)", file, re.MULTILINE): files.append(re.match("a/([^ ]+)", file.splitlines()[0])[1]) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index cb07575fa206c..a012bdefa06b1 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -290,6 +290,7 @@ write_cmake_config("llvm-config") { values = [ "LLVM_BUILD_LLVM_DYLIB=", "LLVM_BUILD_SHARED_LIBS=", + "LLVM_BUILD_TELEMETRY=", "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple", "LLVM_ENABLE_DUMP=", "LLVM_ENABLE_HTTPLIB=", diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index 23f5d03583556..6125f0457933d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -50,6 +50,7 @@ static_library("CodeGen") { "DFAPacketizer.cpp", "DeadMachineInstructionElim.cpp", "DetectDeadLanes.cpp", + "DroppedVariableStatsMIR.cpp", "DwarfEHPrepare.cpp", "EHContGuardCatchret.cpp", "EarlyIfConversion.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn index 4ad22724d6225..4f103d30f300b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn @@ -40,6 +40,8 @@ static_library("IR") { "DiagnosticInfo.cpp", "DiagnosticPrinter.cpp", "Dominators.cpp", + "DroppedVariableStats.cpp", + "DroppedVariableStatsIR.cpp", "EHPersonalities.cpp", "FPEnv.cpp", "Function.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn index 0483400d74803..274f5b54345c7 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn @@ -21,7 +21,6 @@ static_library("Passes") { ] sources = [ "CodeGenPassBuilder.cpp", - "DroppedVariableStatsIR.cpp", "OptimizationLevel.cpp", "PassBuilder.cpp", "PassBuilderBindings.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index 177ae3c3a7b29..e3095e2f3df26 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -143,6 +143,7 @@ 
static_library("LLVMRISCVCodeGen") { "RISCVTargetObjectFile.cpp", "RISCVTargetTransformInfo.cpp", "RISCVVLOptimizer.cpp", + "RISCVVMV0Elimination.cpp", "RISCVVectorMaskDAGMutation.cpp", "RISCVVectorPeephole.cpp", "RISCVZacasABIFix.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn index a3f89a5648cb5..2fbc127199609 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn @@ -26,6 +26,7 @@ unittest("CodeGenTests") { "CCStateTest.cpp", "DIEHashTest.cpp", "DIETest.cpp", + "DroppedVariableStatsMIRTest.cpp", "DwarfStringPoolEntryRefTest.cpp", "InstrRefLDVTest.cpp", "LexicalScopesTest.cpp", diff --git a/mlir/docs/BufferDeallocationInternals.md b/mlir/docs/BufferDeallocationInternals.md deleted file mode 100644 index 00830ba9d2dc2..0000000000000 --- a/mlir/docs/BufferDeallocationInternals.md +++ /dev/null @@ -1,705 +0,0 @@ -# Buffer Deallocation - Internals - -**Note:** This pass is deprecated. Please use the ownership-based buffer -deallocation pass instead. - -This section covers the internal functionality of the BufferDeallocation -transformation. The transformation consists of several passes. The main pass -called BufferDeallocation can be applied via “-buffer-deallocation” on MLIR -programs. - -[TOC] - -## Requirements - -In order to use BufferDeallocation on an arbitrary dialect, several control-flow -interfaces have to be implemented when using custom operations. This is -particularly important to understand the implicit control-flow dependencies -between different parts of the input program. Without implementing the following -interfaces, control-flow relations cannot be discovered properly and the -resulting program can become invalid: - -* Branch-like terminators should implement the `BranchOpInterface` to query - and manipulate associated operands. -* Operations involving structured control flow have to implement the - `RegionBranchOpInterface` to model inter-region control flow. -* Terminators yielding values to their parent operation (in particular in the - scope of nested regions within `RegionBranchOpInterface`-based operations), - should implement the `ReturnLike` trait to represent logical “value - returns”. - -Example dialects that are fully compatible are the “std” and “scf” dialects with -respect to all implemented interfaces. - -During Bufferization, we convert immutable value types (tensors) to mutable -types (memref). This conversion is done in several steps and in all of these -steps the IR has to fulfill SSA like properties. The usage of memref has to be -in the following consecutive order: allocation, write-buffer, read- buffer. In -this case, there are only buffer reads allowed after the initial full buffer -write is done. In particular, there must be no partial write to a buffer after -the initial write has been finished. However, partial writes in the initializing -is allowed (fill buffer step by step in a loop e.g.). This means, all buffer -writes needs to dominate all buffer reads. - -Example for breaking the invariant: - -```mlir -func.func @condBranch(%arg0: i1, %arg1: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3() -^bb2: - partial_write(%0, %0) - cf.br ^bb3() -^bb3(): - test.copy(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -``` - -The maintenance of the SSA like properties is only needed in the bufferization -process. 
Afterwards, for example in optimization processes, the property is no -longer needed. - -## Detection of Buffer Allocations - -The first step of the BufferDeallocation transformation is to identify -manageable allocation operations that implement the `SideEffects` interface. -Furthermore, these ops need to apply the effect `MemoryEffects::Allocate` to a -particular result value while not using the resource -`SideEffects::AutomaticAllocationScopeResource` (since it is currently reserved -for allocations, like `Alloca` that will be automatically deallocated by a -parent scope). Allocations that have not been detected in this phase will not be -tracked internally, and thus, not deallocated automatically. However, -BufferDeallocation is fully compatible with “hybrid” setups in which tracked and -untracked allocations are mixed: - -```mlir -func.func @mixedAllocation(%arg0: i1) { - %0 = memref.alloca() : memref<2xf32> // aliases: %2 - %1 = memref.alloc() : memref<2xf32> // aliases: %2 - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - use(%0) - cf.br ^bb3(%0 : memref<2xf32>) -^bb2: - use(%1) - cf.br ^bb3(%1 : memref<2xf32>) -^bb3(%2: memref<2xf32>): - ... -} -``` - -Example of using a conditional branch with alloc and alloca. BufferDeallocation -can detect and handle the different allocation types that might be intermixed. - -Note: the current version does not support allocation operations returning -multiple result buffers. - -## Conversion from AllocOp to AllocaOp - -The PromoteBuffersToStack-pass converts AllocOps to AllocaOps, if possible. In -some cases, it can be useful to use such stack-based buffers instead of -heap-based buffers. The conversion is restricted to several constraints like: - -* Control flow -* Buffer Size -* Dynamic Size - -If a buffer is leaving a block, we are not allowed to convert it into an alloca. -If the size of the buffer is large, we could convert it, but regarding stack -overflow, it makes sense to limit the size of these buffers and only convert -small ones. The size can be set via a pass option. The current default value is -1KB. Furthermore, we can not convert buffers with dynamic size, since the -dimension is not known a priori. - -## Movement and Placement of Allocations - -Using the buffer hoisting pass, all buffer allocations are moved as far upwards -as possible in order to group them and make upcoming optimizations easier by -limiting the search space. Such a movement is shown in the following graphs. In -addition, we are able to statically free an alloc, if we move it into a -dominator of all of its uses. This simplifies further optimizations (e.g. buffer -fusion) in the future. However, movement of allocations is limited by external -data dependencies (in particular in the case of allocations of dynamically -shaped types). Furthermore, allocations can be moved out of nested regions, if -necessary. In order to move allocations to valid locations with respect to their -uses only, we leverage Liveness information. 
- -The following code snippets shows a conditional branch before running the -BufferHoisting pass: - -![branch_example_pre_move](/includes/img/branch_example_pre_move.svg) - -```mlir -func.func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) -^bb2: - %0 = memref.alloc() : memref<2xf32> // aliases: %1 - use(%0) - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%1: memref<2xf32>): // %1 could be %0 or %arg1 - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () - return -} -``` - -Applying the BufferHoisting pass on this program results in the following piece -of code: - -![branch_example_post_move](/includes/img/branch_example_post_move.svg) - -```mlir -func.func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> // moved to bb0 - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) -^bb2: - use(%0) - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () - return -} -``` - -The alloc is moved from bb2 to the beginning and it is passed as an argument to -bb3. - -The following example demonstrates an allocation using dynamically shaped types. -Due to the data dependency of the allocation to %0, we cannot move the -allocation out of bb2 in this case: - -```mlir -func.func @condBranchDynamicType( - %arg0: i1, - %arg1: memref, - %arg2: memref, - %arg3: index) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg3: index) -^bb1: - cf.br ^bb3(%arg1 : memref) -^bb2(%0: index): - %1 = memref.alloc(%0) : memref // cannot be moved upwards to the data - // dependency to %0 - use(%1) - cf.br ^bb3(%1 : memref) -^bb3(%2: memref): - test.copy(%2, %arg2) : (memref, memref) -> () - return -} -``` - -## Introduction of Clones - -In order to guarantee that all allocated buffers are freed properly, we have to -pay attention to the control flow and all potential aliases a buffer allocation -can have. Since not all allocations can be safely freed with respect to their -aliases (see the following code snippet), it is often required to introduce -copies to eliminate them. Consider the following example in which the -allocations have already been placed: - -```mlir -func.func @branch(%arg0: i1) { - %0 = memref.alloc() : memref<2xf32> // aliases: %2 - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - %1 = memref.alloc() : memref<2xf32> // resides here for demonstration purposes - // aliases: %2 - cf.br ^bb3(%1 : memref<2xf32>) -^bb2: - use(%0) - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%2: memref<2xf32>): - … - return -} -``` - -The first alloc can be safely freed after the live range of its post-dominator -block (bb3). The alloc in bb1 has an alias %2 in bb3 that also keeps this buffer -alive until the end of bb3. Since we cannot determine the actual branches that -will be taken at runtime, we have to ensure that all buffers are freed correctly -in bb3 regardless of the branches we will take to reach the exit block. This -makes it necessary to introduce a copy for %2, which allows us to free %alloc0 -in bb0 and %alloc1 in bb1. Afterwards, we can continue processing all aliases of -%2 (none in this case) and we can safely free %2 at the end of the sample -program. This sample demonstrates that not all allocations can be safely freed -in their associated post-dominator blocks. Instead, we have to pay attention to -all of their aliases. 
- -Applying the BufferDeallocation pass to the program above yields the following -result: - -```mlir -func.func @branch(%arg0: i1) { - %0 = memref.alloc() : memref<2xf32> - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - %1 = memref.alloc() : memref<2xf32> - %3 = bufferization.clone %1 : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %1 : memref<2xf32> // %1 can be safely freed here - cf.br ^bb3(%3 : memref<2xf32>) -^bb2: - use(%0) - %4 = bufferization.clone %0 : (memref<2xf32>) -> (memref<2xf32>) - cf.br ^bb3(%4 : memref<2xf32>) -^bb3(%2: memref<2xf32>): - … - memref.dealloc %2 : memref<2xf32> // free temp buffer %2 - memref.dealloc %0 : memref<2xf32> // %0 can be safely freed here - return -} -``` - -Note that a temporary buffer for %2 was introduced to free all allocations -properly. Note further that the unnecessary allocation of %3 can be easily -removed using one of the post-pass transformations or the canonicalization pass. - -The presented example also works with dynamically shaped types. - -BufferDeallocation performs a fix-point iteration taking all aliases of all -tracked allocations into account. We initialize the general iteration process -using all tracked allocations and their associated aliases. As soon as we -encounter an alias that is not properly dominated by our allocation, we mark -this alias as *critical* (needs to be freed and tracked by the internal -fix-point iteration). The following sample demonstrates the presence of critical -and non-critical aliases: - -![nested_branch_example_pre_move](/includes/img/nested_branch_example_pre_move.svg) - -```mlir -func.func @condBranchDynamicTypeNested( - %arg0: i1, - %arg1: memref, // aliases: %3, %4 - %arg2: memref, - %arg3: index) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg3: index) -^bb1: - cf.br ^bb6(%arg1 : memref) -^bb2(%0: index): - %1 = memref.alloc(%0) : memref // cannot be moved upwards due to the data - // dependency to %0 - // aliases: %2, %3, %4 - use(%1) - cf.cond_br %arg0, ^bb3, ^bb4 -^bb3: - cf.br ^bb5(%1 : memref) -^bb4: - cf.br ^bb5(%1 : memref) -^bb5(%2: memref): // non-crit. alias of %1, since %1 dominates %2 - cf.br ^bb6(%2 : memref) -^bb6(%3: memref): // crit. alias of %arg1 and %2 (in other words %1) - cf.br ^bb7(%3 : memref) -^bb7(%4: memref): // non-crit. alias of %3, since %3 dominates %4 - test.copy(%4, %arg2) : (memref, memref) -> () - return -} -``` - -Applying BufferDeallocation yields the following output: - -![nested_branch_example_post_move](/includes/img/nested_branch_example_post_move.svg) - -```mlir -func.func @condBranchDynamicTypeNested( - %arg0: i1, - %arg1: memref, - %arg2: memref, - %arg3: index) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg3 : index) -^bb1: - // temp buffer required due to alias %3 - %5 = bufferization.clone %arg1 : (memref) -> (memref) - cf.br ^bb6(%5 : memref) -^bb2(%0: index): - %1 = memref.alloc(%0) : memref - use(%1) - cf.cond_br %arg0, ^bb3, ^bb4 -^bb3: - cf.br ^bb5(%1 : memref) -^bb4: - cf.br ^bb5(%1 : memref) -^bb5(%2: memref): - %6 = bufferization.clone %1 : (memref) -> (memref) - memref.dealloc %1 : memref - cf.br ^bb6(%6 : memref) -^bb6(%3: memref): - cf.br ^bb7(%3 : memref) -^bb7(%4: memref): - test.copy(%4, %arg2) : (memref, memref) -> () - memref.dealloc %3 : memref // free %3, since %4 is a non-crit. alias of %3 - return -} -``` - -Since %3 is a critical alias, BufferDeallocation introduces an additional -temporary copy in all predecessor blocks. %3 has an additional (non-critical) -alias %4 that extends the live range until the end of bb7. 
Therefore, we can -free %3 after its last use, while taking all aliases into account. Note that %4 -does not need to be freed, since we did not introduce a copy for it. - -The actual introduction of buffer copies is done after the fix-point iteration -has been terminated and all critical aliases have been detected. A critical -alias can be either a block argument or another value that is returned by an -operation. Copies for block arguments are handled by analyzing all predecessor -blocks. This is primarily done by querying the `BranchOpInterface` of the -associated branch terminators that can jump to the current block. Consider the -following example which involves a simple branch and the critical block argument -%2: - -```mlir - custom.br ^bb1(..., %0, : ...) - ... - custom.br ^bb1(..., %1, : ...) - ... -^bb1(%2: memref<2xf32>): - ... -``` - -The `BranchOpInterface` allows us to determine the actual values that will be -passed to block bb1 and its argument %2 by analyzing its predecessor blocks. -Once we have resolved the values %0 and %1 (that are associated with %2 in this -sample), we can introduce a temporary buffer and clone its contents into the new -buffer. Afterwards, we rewire the branch operands to use the newly allocated -buffer instead. However, blocks can have implicitly defined predecessors by -parent ops that implement the `RegionBranchOpInterface`. This can be the case if -this block argument belongs to the entry block of a region. In this setting, we -have to identify all predecessor regions defined by the parent operation. For -every region, we need to get all terminator operations implementing the -`ReturnLike` trait, indicating that they can branch to our current block. -Finally, we can use a similar functionality as described above to add the -temporary copy. This time, we can modify the terminator operands directly -without touching a high-level interface. - -Consider the following inner-region control-flow sample that uses an imaginary -“custom.region_if” operation. It either executes the “then” or “else” region and -always continues to the “join” region. The “custom.region_if_yield” operation -returns a result to the parent operation. This sample demonstrates the use of -the `RegionBranchOpInterface` to determine predecessors in order to infer the -high-level control flow: - -```mlir -func.func @inner_region_control_flow( - %arg0 : index, - %arg1 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref - %1 = custom.region_if %0 : memref -> (memref) - then(%arg2 : memref) { // aliases: %arg4, %1 - custom.region_if_yield %arg2 : memref - } else(%arg3 : memref) { // aliases: %arg4, %1 - custom.region_if_yield %arg3 : memref - } join(%arg4 : memref) { // aliases: %1 - custom.region_if_yield %arg4 : memref - } - return %1 : memref -} -``` - -![region_branch_example_pre_move](/includes/img/region_branch_example_pre_move.svg) - -Non-block arguments (other values) can become aliases when they are returned by -dialect-specific operations. BufferDeallocation supports this behavior via the -`RegionBranchOpInterface`. 
Consider the following example that uses an “scf.if” -operation to determine the value of %2 at runtime which creates an alias: - -```mlir -func.func @nested_region_control_flow(%arg0 : index, %arg1 : index) -> memref { - %0 = arith.cmpi "eq", %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - scf.yield %1 : memref // %2 will be an alias of %1 - } else { - %3 = memref.alloc(%arg0, %arg1) : memref // nested allocation in a div. - // branch - use(%3) - scf.yield %1 : memref // %2 will be an alias of %1 - } - return %2 : memref -} -``` - -In this example, a dealloc is inserted to release the buffer within the else -block since it cannot be accessed by the remainder of the program. Accessing the -`RegionBranchOpInterface`, allows us to infer that %2 is a non-critical alias of -%1 which does not need to be tracked. - -```mlir -func.func @nested_region_control_flow(%arg0: index, %arg1: index) -> memref { - %0 = arith.cmpi "eq", %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - scf.yield %1 : memref - } else { - %3 = memref.alloc(%arg0, %arg1) : memref - use(%3) - memref.dealloc %3 : memref // %3 can be safely freed here - scf.yield %1 : memref - } - return %2 : memref -} -``` - -Analogous to the previous case, we have to detect all terminator operations in -all attached regions of “scf.if” that provides a value to its parent operation -(in this sample via scf.yield). Querying the `RegionBranchOpInterface` allows us -to determine the regions that “return” a result to their parent operation. Like -before, we have to update all `ReturnLike` terminators as described above. -Reconsider a slightly adapted version of the “custom.region_if” example from -above that uses a nested allocation: - -```mlir -func.func @inner_region_control_flow_div( - %arg0 : index, - %arg1 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref - %1 = custom.region_if %0 : memref -> (memref) - then(%arg2 : memref) { // aliases: %arg4, %1 - custom.region_if_yield %arg2 : memref - } else(%arg3 : memref) { - %2 = memref.alloc(%arg0, %arg1) : memref // aliases: %arg4, %1 - custom.region_if_yield %2 : memref - } join(%arg4 : memref) { // aliases: %1 - custom.region_if_yield %arg4 : memref - } - return %1 : memref -} -``` - -Since the allocation %2 happens in a divergent branch and cannot be safely -deallocated in a post-dominator, %arg4 will be considered a critical alias. -Furthermore, %arg4 is returned to its parent operation and has an alias %1. This -causes BufferDeallocation to introduce additional copies: - -```mlir -func.func @inner_region_control_flow_div( - %arg0 : index, - %arg1 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref - %1 = custom.region_if %0 : memref -> (memref) - then(%arg2 : memref) { - %4 = bufferization.clone %arg2 : (memref) -> (memref) - custom.region_if_yield %4 : memref - } else(%arg3 : memref) { - %2 = memref.alloc(%arg0, %arg1) : memref - %5 = bufferization.clone %2 : (memref) -> (memref) - memref.dealloc %2 : memref - custom.region_if_yield %5 : memref - } join(%arg4: memref) { - %4 = bufferization.clone %arg4 : (memref) -> (memref) - memref.dealloc %arg4 : memref - custom.region_if_yield %4 : memref - } - memref.dealloc %0 : memref // %0 can be safely freed here - return %1 : memref -} -``` - -## Placement of Deallocs - -After introducing allocs and copies, deallocs have to be placed to free -allocated memory and avoid memory leaks. 
The deallocation needs to take place -after the last use of the given value. The position can be determined by -calculating the common post-dominator of all values using their remaining -non-critical aliases. A special-case is the presence of back edges: since such -edges can cause memory leaks when a newly allocated buffer flows back to another -part of the program. In these cases, we need to free the associated buffer -instances from the previous iteration by inserting additional deallocs. - -Consider the following “scf.for” use case containing a nested structured -control-flow if: - -```mlir -func.func @loop_nested_if( - %lb: index, - %ub: index, - %step: index, - %buf: memref<2xf32>, - %res: memref<2xf32>) { - %0 = scf.for %i = %lb to %ub step %step - iter_args(%iterBuf = %buf) -> memref<2xf32> { - %1 = arith.cmpi "eq", %i, %ub : index - %2 = scf.if %1 -> (memref<2xf32>) { - %3 = memref.alloc() : memref<2xf32> // makes %2 a critical alias due to a - // divergent allocation - use(%3) - scf.yield %3 : memref<2xf32> - } else { - scf.yield %iterBuf : memref<2xf32> - } - scf.yield %2 : memref<2xf32> - } - test.copy(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () - return -} -``` - -In this example, the *then* branch of the nested “scf.if” operation returns a -newly allocated buffer. - -Since this allocation happens in the scope of a divergent branch, %2 becomes a -critical alias that needs to be handled. As before, we have to insert additional -copies to eliminate this alias using copies of %3 and %iterBuf. This guarantees -that %2 will be a newly allocated buffer that is returned in each iteration. -However, “returning” %2 to its alias %iterBuf turns %iterBuf into a critical -alias as well. In other words, we have to create a copy of %2 to pass it to -%iterBuf. Since this jump represents a back edge, and %2 will always be a new -buffer, we have to free the buffer from the previous iteration to avoid memory -leaks: - -```mlir -func.func @loop_nested_if( - %lb: index, - %ub: index, - %step: index, - %buf: memref<2xf32>, - %res: memref<2xf32>) { - %4 = bufferization.clone %buf : (memref<2xf32>) -> (memref<2xf32>) - %0 = scf.for %i = %lb to %ub step %step - iter_args(%iterBuf = %4) -> memref<2xf32> { - %1 = arith.cmpi "eq", %i, %ub : index - %2 = scf.if %1 -> (memref<2xf32>) { - %3 = memref.alloc() : memref<2xf32> // makes %2 a critical alias - use(%3) - %5 = bufferization.clone %3 : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %3 : memref<2xf32> - scf.yield %5 : memref<2xf32> - } else { - %6 = bufferization.clone %iterBuf : (memref<2xf32>) -> (memref<2xf32>) - scf.yield %6 : memref<2xf32> - } - %7 = bufferization.clone %2 : (memref<2xf32>) -> (memref<2xf32>) - memref.dealloc %2 : memref<2xf32> - memref.dealloc %iterBuf : memref<2xf32> // free backedge iteration variable - scf.yield %7 : memref<2xf32> - } - test.copy(%0, %res) : (memref<2xf32>, memref<2xf32>) -> () - memref.dealloc %0 : memref<2xf32> // free temp copy %0 - return -} -``` - -Example for loop-like control flow. The CFG contains back edges that have to be -handled to avoid memory leaks. The bufferization is able to free the backedge -iteration variable %iterBuf. - -## Private Analyses Implementations - -The BufferDeallocation transformation relies on one primary control-flow -analysis: BufferPlacementAliasAnalysis. Furthermore, we also use dominance and -liveness to place and move nodes. The liveness analysis determines the live -range of a given value. 
Within this range, a value is alive and can or will be used in the course of
the program. After this range, the value is dead and can be discarded - in our
case, the buffer can be freed. To place the allocs, we need to know from which
position a value will be alive. The allocs have to be placed in front of this
position. However, the most important analysis is the alias analysis that is
needed to introduce copies and to place all deallocations.

# Post Phase

In order to limit the complexity of the BufferDeallocation transformation, some
tiny code-polishing/optimization transformations are not applied on-the-fly
during placement. Currently, a canonicalization pattern is added to the clone
operation to reduce the appearance of unnecessary clones.

Note: further transformations might be added to the post-pass phase in the
future.

## Clone Canonicalization

During placement of clones it may happen that unnecessary clones are inserted.
If these clones appear with their corresponding dealloc operation within the
same block, we can use the canonicalizer to remove these unnecessary operations.
Note that this step needs to take place after the insertion of clones and
deallocs in the buffer deallocation step. The canonicalization includes both
the newly created target value from the clone operation and the source
operation.

## Canonicalization of the Source Buffer of the Clone Operation

In this case, the source of the clone operation can be used instead of its
target. The unused allocation and deallocation operations that are defined for
this clone operation are also removed. Here is a working example generated by
the BufferDeallocation pass that allocates a buffer with dynamic size. A deeper
analysis of this sample reveals that the highlighted operations are redundant
and can be removed.

```mlir
func.func @dynamic_allocation(%arg0: index, %arg1: index) -> memref<?x?xf32> {
  %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
  %2 = bufferization.clone %1 : (memref<?x?xf32>) -> (memref<?x?xf32>)
  memref.dealloc %1 : memref<?x?xf32>
  return %2 : memref<?x?xf32>
}
```

Will be transformed to:

```mlir
func.func @dynamic_allocation(%arg0: index, %arg1: index) -> memref<?x?xf32> {
  %1 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
  return %1 : memref<?x?xf32>
}
```

In this case, the additional copy %2 can be replaced with its original source
buffer %1. This also applies to the associated dealloc operation of %1.

## Canonicalization of the Target Buffer of the Clone Operation

In this case, the target buffer of the clone operation can be used instead of
its source. The unused deallocation operation that is defined for this clone
operation is also removed.

Consider the following example where a generic test operation writes the result
to %temp and then copies %temp to %result. However, these two operations can be
merged into a single step.
Canonicalization removes the clone operation and %temp, and replaces the uses
of %temp with %result:

```mlir
func.func @reuseTarget(%arg0: memref<2xf32>, %result: memref<2xf32>){
  %temp = memref.alloc() : memref<2xf32>
  test.generic {
    args_in = 1 : i64,
    args_out = 1 : i64,
    indexing_maps = [#map0, #map0],
    iterator_types = ["parallel"]} %arg0, %temp {
  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
    %tmp2 = math.exp %gen2_arg0 : f32
    test.yield %tmp2 : f32
  }: memref<2xf32>, memref<2xf32>
  %result = bufferization.clone %temp : (memref<2xf32>) -> (memref<2xf32>)
  memref.dealloc %temp : memref<2xf32>
  return
}
```

Will be transformed to:

```mlir
func.func @reuseTarget(%arg0: memref<2xf32>, %result: memref<2xf32>){
  test.generic {
    args_in = 1 : i64,
    args_out = 1 : i64,
    indexing_maps = [#map0, #map0],
    iterator_types = ["parallel"]} %arg0, %result {
  ^bb0(%gen2_arg0: f32, %gen2_arg1: f32):
    %tmp2 = math.exp %gen2_arg0 : f32
    test.yield %tmp2 : f32
  }: memref<2xf32>, memref<2xf32>
  return
}
```

## Known Limitations

BufferDeallocation introduces additional clone operations from the
“bufferization” dialect (“bufferization.clone”). Analogously, all deallocations
use the “memref” dialect operation “memref.dealloc”. The actual copy process is
realized using “test.copy”. Furthermore, buffers are essentially immutable after
their creation in a block. Other limitations are known in the case of
unstructured control flow.
diff --git a/mlir/docs/OwnershipBasedBufferDeallocation.md b/mlir/docs/OwnershipBasedBufferDeallocation.md
index 9036c811c5daf..f5fa01c4c49cc 100644
--- a/mlir/docs/OwnershipBasedBufferDeallocation.md
+++ b/mlir/docs/OwnershipBasedBufferDeallocation.md
@@ -5,9 +5,7 @@
 One-Shot Bufferize does not deallocate any buffers that it allocates. After
 running One-Shot Bufferize, the resulting IR may have a number of `memref.alloc`
 ops, but no `memref.dealloc` ops. Buffer dellocation is delegated to the
-`-ownership-based-buffer-deallocation` pass. This pass supersedes the now
-deprecated `-buffer-deallocation` pass, which does not work well with
-One-Shot Bufferize.
+`-ownership-based-buffer-deallocation` pass.
 
 On a high level, buffers are "owned" by a basic block. Ownership materializes
 as an `i1` SSA value and can be thought of as "responsibility to deallocate".
It diff --git a/mlir/docs/includes/img/branch_example_post_move.svg b/mlir/docs/includes/img/branch_example_post_move.svg deleted file mode 100644 index 870df495a13c6..0000000000000 --- a/mlir/docs/includes/img/branch_example_post_move.svg +++ /dev/null @@ -1,419 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - in: %arg0, %arg1, %arg2 - bb0 - - bb2 - bb1 - - bb3 (%1) - -use(%0) -cf.br bb3(%0) - - copy (%1, arg2) - %0 - %arg1 - %0 = memref.alloc() - - diff --git a/mlir/docs/includes/img/branch_example_pre_move.svg b/mlir/docs/includes/img/branch_example_pre_move.svg deleted file mode 100644 index 5eb15fd13946e..0000000000000 --- a/mlir/docs/includes/img/branch_example_pre_move.svg +++ /dev/null @@ -1,409 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - in: %arg0, %arg1, %arg2 - bb0 - - bb2 - bb1 - - bb3 (%1) - %0 = memref.alloc() -use(%0) -cf.br bb3(%0) - - copy (%1, arg2) - %0 - %arg1 - - diff --git a/mlir/docs/includes/img/nested_branch_example_post_move.svg b/mlir/docs/includes/img/nested_branch_example_post_move.svg deleted file mode 100644 index 27923627ad3d2..0000000000000 --- a/mlir/docs/includes/img/nested_branch_example_post_move.svg +++ /dev/null @@ -1,759 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - in: %arg0, %arg1, %arg2, %arg3 - bb0 - - bb1 - - - %5 - - - - - bb2 (%0) - bb4 - bb3 - - bb5 (%2) - %1 - - bb7 (%4) - - - - - - - - - bb6 (%3) - %1 - %6 - - %1 = memref.alloc(%0)use(%1) - - copy(%4, %arg2)dealloc %3 - %3 - %5 = memref.alloc(%d0)copy(%arg1, %5) - %6 = memref.alloc(%d1)copy(%1, %6)dealloc %1 - - diff --git a/mlir/docs/includes/img/nested_branch_example_pre_move.svg b/mlir/docs/includes/img/nested_branch_example_pre_move.svg deleted file mode 100644 index 9f2c603511f84..0000000000000 --- a/mlir/docs/includes/img/nested_branch_example_pre_move.svg +++ /dev/null @@ -1,717 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - in: %arg0, %arg1, %arg2, %arg3 - bb0 - - bb1 - - - %arg1 - - - - - bb2 (%0) - bb4 - bb3 - - bb5 (%2) - %1 - - bb7 (%4) - - - - - - - - - bb6 (%3) - %1 - %2 - - %1 = memref.alloc(%0)use(%0) - - copy(%4, %arg2) - %3 - - diff --git a/mlir/docs/includes/img/region_branch_example_pre_move.svg b/mlir/docs/includes/img/region_branch_example_pre_move.svg deleted file mode 100644 index 79c83fbe35a9e..0000000000000 --- a/mlir/docs/includes/img/region_branch_example_pre_move.svg +++ /dev/null @@ -1,435 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - %0 - if - - then - %1 - else - - join - %arg4 - %arg4 - %arg2 - %arg3 - - - diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h index 7d2fd89e8560f..14ccae650606a 100644 --- a/mlir/include/mlir-c/IR.h +++ b/mlir/include/mlir-c/IR.h @@ -309,6 +309,10 @@ MLIR_CAPI_EXPORTED MlirModule mlirModuleCreateEmpty(MlirLocation location); MLIR_CAPI_EXPORTED MlirModule mlirModuleCreateParse(MlirContext context, MlirStringRef module); +/// Parses a module from file and transfers ownership to the caller. 
+MLIR_CAPI_EXPORTED MlirModule +mlirModuleCreateParseFromFile(MlirContext context, MlirStringRef fileName); + /// Gets the context that a module was created with. MLIR_CAPI_EXPORTED MlirContext mlirModuleGetContext(MlirModule module); diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h index bc29d04287ac4..ea5034b60d8bd 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.h +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -14,6 +14,7 @@ #ifndef MLIR_DIALECT_AFFINE_PASSES_H #define MLIR_DIALECT_AFFINE_PASSES_H +#include "mlir/Interfaces/FunctionInterfaces.h" #include "mlir/Pass/Pass.h" #include @@ -93,7 +94,7 @@ std::unique_ptr> createLoopTilingPass(); /// factors supplied through other means. If -1 is passed as the unrollFactor /// and no callback is provided, anything passed from the command-line (if at /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). -std::unique_ptr> createLoopUnrollPass( +std::unique_ptr> createLoopUnrollPass( int unrollFactor = -1, bool unrollUpToFactor = false, bool unrollFull = false, const std::function &getUnrollFactor = nullptr); diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td index d7c7897c65730..5325d3b0a1d69 100644 --- a/mlir/include/mlir/Dialect/Affine/Passes.td +++ b/mlir/include/mlir/Dialect/Affine/Passes.td @@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> { ]; } -def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> { +def AffineLoopUnroll : InterfacePass<"affine-loop-unroll", "FunctionOpInterface"> { let summary = "Unroll affine loops"; let constructor = "mlir::affine::createLoopUnrollPass()"; let options = [ diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/ShardingInterfaceImpl.h b/mlir/include/mlir/Dialect/Arith/Transforms/ShardingInterfaceImpl.h new file mode 100644 index 0000000000000..5addffbe571be --- /dev/null +++ b/mlir/include/mlir/Dialect/Arith/Transforms/ShardingInterfaceImpl.h @@ -0,0 +1,23 @@ +//===- ShardingInterfaceImpl.h - ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_ARITH_TRANSFORMS_SHARDINGINTERFACEIMPL_H_ +#define MLIR_DIALECT_ARITH_TRANSFORMS_SHARDINGINTERFACEIMPL_H_ + +namespace mlir { + +class DialectRegistry; + +namespace arith { + +void registerShardingInterfaceExternalModels(DialectRegistry ®istry); + +} // namespace arith +} // namespace mlir + +#endif // MLIR_DIALECT_ARITH_TRANSFORMS_SHARDINGINTERFACEIMPL_H_ diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index c8e456a1d7e38..c5d0853d6ff97 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -30,10 +30,6 @@ using DeallocHelperMap = llvm::DenseMap; #define GEN_PASS_DECL #include "mlir/Dialect/Bufferization/Transforms/Passes.h.inc" -/// Creates an instance of the BufferDeallocation pass to free all allocated -/// buffers. -std::unique_ptr createBufferDeallocationPass(); - /// Creates an instance of the OwnershipBasedBufferDeallocation pass to free all /// allocated buffers. 
std::unique_ptr createOwnershipBasedBufferDeallocationPass( @@ -141,9 +137,6 @@ void populateBufferizationDeallocLoweringPattern( func::FuncOp buildDeallocationLibraryFunction(OpBuilder &builder, Location loc, SymbolTable &symbolTable); -/// Run buffer deallocation. -LogicalResult deallocateBuffers(Operation *op); - /// Run the ownership-based buffer deallocation. LogicalResult deallocateBuffersOwnershipBased(FunctionOpInterface op, DeallocationOptions options); diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index 3bcde8edde509..f20f177d8443b 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -11,79 +11,6 @@ include "mlir/Pass/PassBase.td" -def BufferDeallocation : Pass<"buffer-deallocation", "func::FuncOp"> { - let summary = "Adds all required dealloc operations for all allocations in " - "the input program"; - let description = [{ - This pass implements an algorithm to automatically introduce all required - deallocation operations for all buffers in the input program. This ensures - that the resulting program does not have any memory leaks. - - - Input - - ```mlir - #map0 = affine_map<(d0) -> (d0)> - module { - func.func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 - ^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) - ^bb2: - %0 = memref.alloc() : memref<2xf32> - linalg.generic { - indexing_maps = [#map0, #map0], - iterator_types = ["parallel"]} %arg1, %0 { - ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): - %tmp1 = exp %gen1_arg0 : f32 - linalg.yield %tmp1 : f32 - }: memref<2xf32>, memref<2xf32> - cf.br ^bb3(%0 : memref<2xf32>) - ^bb3(%1: memref<2xf32>): - "memref.copy"(%1, %arg2) : (memref<2xf32>, memref<2xf32>) -> () - return - } - } - - ``` - - Output - - ```mlir - #map0 = affine_map<(d0) -> (d0)> - module { - func.func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %0 = memref.alloc() : memref<2xf32> - memref.copy(%arg1, %0) : memref<2xf32>, memref<2xf32> - cf.br ^bb3(%0 : memref<2xf32>) - ^bb2: // pred: ^bb0 - %1 = memref.alloc() : memref<2xf32> - linalg.generic { - indexing_maps = [#map0, #map0], - iterator_types = ["parallel"]} %arg1, %1 { - ^bb0(%arg3: f32, %arg4: f32): - %4 = exp %arg3 : f32 - linalg.yield %4 : f32 - }: memref<2xf32>, memref<2xf32> - %2 = memref.alloc() : memref<2xf32> - memref.copy(%1, %2) : memref<2xf32>, memref<2xf32> - dealloc %1 : memref<2xf32> - cf.br ^bb3(%2 : memref<2xf32>) - ^bb3(%3: memref<2xf32>): // 2 preds: ^bb1, ^bb2 - memref.copy(%3, %arg2) : memref<2xf32>, memref<2xf32> - dealloc %3 : memref<2xf32> - return - } - - } - ``` - - }]; - let constructor = "mlir::bufferization::createBufferDeallocationPass()"; -} - def OwnershipBasedBufferDeallocation : Pass< "ownership-based-buffer-deallocation"> { let summary = "Adds all required dealloc operations for all allocations in " @@ -390,8 +317,9 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> { results in a new buffer allocation. One-Shot Bufferize does not deallocate any buffers that it allocates. The - `-buffer-deallocation` pass should be run after One-Shot Bufferize to insert - the deallocation operations necessary to eliminate memory leaks. 
+ `-buffer-deallocation-pipeline` pipeline should be run after One-Shot + Bufferize to insert the deallocation operations necessary to eliminate + memory leaks. One-Shot Bufferize will by default reject IR that contains non-bufferizable op, i.e., ops that do not implemement BufferizableOpInterface. Such IR can diff --git a/mlir/include/mlir/Dialect/DLTI/DLTI.h b/mlir/include/mlir/Dialect/DLTI/DLTI.h index f268fea340a6f..84cf6eabc5fa9 100644 --- a/mlir/include/mlir/Dialect/DLTI/DLTI.h +++ b/mlir/include/mlir/Dialect/DLTI/DLTI.h @@ -28,6 +28,12 @@ namespace dlti { /// query interface-implementing attrs, starting from attr obtained from `op`. FailureOr query(Operation *op, ArrayRef keys, bool emitError = false); + +/// Perform a DLTI-query at `op` using each string in `keys` as a separate DLTI +/// entry key, recursively querying on query interface-implementing attrs, +/// starting from attr obtained from `op`. +FailureOr query(Operation *op, ArrayRef keys, + bool emitError = false); } // namespace dlti } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h index 75cb096130ca6..fc5cfffea27a7 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h @@ -51,7 +51,7 @@ class MeshSharding { SmallVector dynamic_sharded_dims_offsets; public: - MeshSharding() = default; + MeshSharding(::mlir::FlatSymbolRefAttr mesh_ = nullptr); MeshSharding(Value rhs); static MeshSharding get(::mlir::FlatSymbolRefAttr mesh_, ArrayRef split_axes_, @@ -62,7 +62,7 @@ class MeshSharding { ArrayRef dynamic_halo_sizes_ = {}, ArrayRef dynamic_sharded_dims_offsets_ = {}); ::mlir::FlatSymbolRefAttr getMeshAttr() const { return mesh; } - ::llvm::StringRef getMesh() const { return mesh.getValue(); } + ::llvm::StringRef getMesh() const { return mesh ? mesh.getValue() : ""; } ArrayRef getSplitAxes() const { return split_axes; } ArrayRef getPartialAxes() const { return partial_axes; } ReductionKind getPartialType() const { return partial_type; } @@ -201,10 +201,12 @@ ShapedType shardShapedType(ShapedType shape, MeshOp mesh, Type shardType(Type type, MeshOp mesh, MeshSharding sharding); // Insert shard op if there is not one that already has the same sharding. +// Use newShardOp if it is not null. Otherwise create a new one. // May insert resharding if required. +// Potentially updates newShardOp. 
void maybeInsertTargetShardingAnnotation(MeshSharding sharding, - OpOperand &operand, - OpBuilder &builder); + OpOperand &operand, OpBuilder &builder, + ShardOp &newShardOp); void maybeInsertTargetShardingAnnotation(MeshSharding sharding, OpResult result, OpBuilder &builder); void maybeInsertSourceShardingAnnotation(MeshSharding sharding, diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td index 6039e61a93fad..031e6f63bcb42 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td @@ -28,7 +28,7 @@ class Mesh_Op traits = []> : Op { } -def Mesh_MeshOp : Mesh_Op<"mesh", [Symbol]> { +def Mesh_MeshOp : Mesh_Op<"mesh", [Symbol, Pure]> { let summary = "Description of a device/process mesh."; let description = [{ The mesh.mesh operation is a symbol operation that identifies a specific @@ -318,12 +318,33 @@ def Mesh_ShardingOp : Mesh_Op<"sharding", [ "ArrayRef":$split_axes, "::mlir::ArrayRef<::mlir::OpFoldResult>":$halo_sizes, "::mlir::ArrayRef<::mlir::OpFoldResult>":$sharded_dims_offsets)>, + OpBuilder<(ins "llvm::StringRef":$mesh, + "ArrayRef":$split_axes, + CArg<"ArrayRef", "{}">:$static_halo_sizes, + CArg<"ArrayRef", "{}">:$static_sharded_dims_offsets + )>, OpBuilder<(ins "mlir::mesh::MeshSharding":$from)> ]; let hasVerifier = 1; let hasCanonicalizer = 1; } +def Mesh_GetShardingOp : Mesh_Op<"get_sharding", [Pure]> { + let summary = "Get the sharding of the given tensor."; + let description = [{ + This operation returns the sharding of the given tensor as a MeshSharding. + }]; + let arguments = (ins + AnyRankedTensor:$source + ); + let results = (outs + Mesh_Sharding:$result + ); + let assemblyFormat = [{ + $source attr-dict `:` type($source) `->` type($result) + }]; +} + def Mesh_ShardShapeOp : Mesh_Op<"shard_shape", [Pure]> { let summary = "Get the shard shape of a given process/device."; let description = [{ @@ -460,6 +481,7 @@ def Mesh_ShardOp : Mesh_Op<"shard", [ (`annotate_for_users` $annotate_for_users^)? 
attr-dict `:` type($result) }]; + let hasCanonicalizer = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h index b4d25cef05a7b..14aad7f9f6783 100644 --- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h +++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h @@ -36,7 +36,9 @@ struct ShardingOption { bool empty = false; ShardingOption() = default; ShardingOption(ShardingArray shardingArray, FlatSymbolRefAttr mesh) - : shardingArray(std::move(shardingArray)), mesh(mesh) {} + : shardingArray(std::move(shardingArray)), mesh(mesh) { + assert(this->mesh); + } static ShardingOption makeEmpty() { auto res = ShardingOption(); res.empty = true; diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 42da20251c190..70a2ba0919952 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -562,53 +562,33 @@ class OpenACC_DataEntryOp":$varPtr, - "bool":$structured, "bool":$implicit, - CArg<"::mlir::ValueRange", "{}">:$bounds), - [{ - build($_builder, $_state, varPtr.getType(), varPtr, - /*varType=*/::mlir::TypeAttr::get( - varPtr.getType().getElementType()), - /*varPtrPtr=*/{}, bounds, /*asyncOperands=*/{}, - /*asyncOperandsDeviceType=*/nullptr, - /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, - /*structured=*/$_builder.getBoolAttr(structured), - /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr); - }]>, - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$varPtr, - "bool":$structured, "bool":$implicit, - "const ::llvm::Twine &":$name, - CArg<"::mlir::ValueRange", "{}">:$bounds), - [{ - build($_builder, $_state, varPtr.getType(), varPtr, - /*varType=*/::mlir::TypeAttr::get( - varPtr.getType().getElementType()), - /*varPtrPtr=*/{}, bounds, /*asyncOperands=*/{}, - /*asyncOperandsDeviceType=*/nullptr, - /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, - /*structured=*/$_builder.getBoolAttr(structured), - /*implicit=*/$_builder.getBoolAttr(implicit), - /*name=*/$_builder.getStringAttr(name)); - }]>, - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::MappableType>":$var, + OpBuilder<(ins "::mlir::Value":$var, "bool":$structured, "bool":$implicit, CArg<"::mlir::ValueRange", "{}">:$bounds), [{ + auto ptrLikeTy = ::mlir::dyn_cast<::mlir::acc::PointerLikeType>( + var.getType()); build($_builder, $_state, var.getType(), var, - /*varType=*/::mlir::TypeAttr::get(var.getType()), + /*varType=*/ptrLikeTy ? + ::mlir::TypeAttr::get(ptrLikeTy.getElementType()) : + ::mlir::TypeAttr::get(var.getType()), /*varPtrPtr=*/{}, bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr, /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, /*structured=*/$_builder.getBoolAttr(structured), /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr); }]>, - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::MappableType>":$var, + OpBuilder<(ins "::mlir::Value":$var, "bool":$structured, "bool":$implicit, "const ::llvm::Twine &":$name, CArg<"::mlir::ValueRange", "{}">:$bounds), [{ + auto ptrLikeTy = ::mlir::dyn_cast<::mlir::acc::PointerLikeType>( + var.getType()); build($_builder, $_state, var.getType(), var, - /*varType=*/::mlir::TypeAttr::get(var.getType()), + /*varType=*/ptrLikeTy ? 
+ ::mlir::TypeAttr::get(ptrLikeTy.getElementType()) : + ::mlir::TypeAttr::get(var.getType()), /*varPtrPtr=*/{}, bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr, /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, @@ -942,28 +922,34 @@ class OpenACC_DataExitOpWithVarPtr }]; let builders = [ - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$accPtr, - "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$varPtr, + OpBuilder<(ins "::mlir::Value":$accVar, + "::mlir::Value":$var, "bool":$structured, "bool":$implicit, CArg<"::mlir::ValueRange", "{}">:$bounds), [{ - build($_builder, $_state, accPtr, varPtr, - /*varType=*/::mlir::TypeAttr::get( - varPtr.getType().getElementType()), + auto ptrLikeTy = ::mlir::dyn_cast<::mlir::acc::PointerLikeType>( + var.getType()); + build($_builder, $_state, accVar, var, + /*varType=*/ptrLikeTy ? + ::mlir::TypeAttr::get(ptrLikeTy.getElementType()) : + ::mlir::TypeAttr::get(var.getType()), bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr, /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, /*structured=*/$_builder.getBoolAttr(structured), /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr); }]>, - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$accPtr, - "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$varPtr, + OpBuilder<(ins "::mlir::Value":$accVar, + "::mlir::Value":$var, "bool":$structured, "bool":$implicit, "const ::llvm::Twine &":$name, CArg<"::mlir::ValueRange", "{}">:$bounds), [{ - build($_builder, $_state, accPtr, varPtr, - /*varType=*/::mlir::TypeAttr::get( - varPtr.getType().getElementType()), + auto ptrLikeTy = ::mlir::dyn_cast<::mlir::acc::PointerLikeType>( + var.getType()); + build($_builder, $_state, accVar, var, + /*varType=*/ptrLikeTy ? + ::mlir::TypeAttr::get(ptrLikeTy.getElementType()) : + ::mlir::TypeAttr::get(var.getType()), bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr, /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, /*structured=*/$_builder.getBoolAttr(structured), @@ -996,22 +982,22 @@ class OpenACC_DataExitOpNoVarPtr : }]; let builders = [ - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$accPtr, + OpBuilder<(ins "::mlir::Value":$accVar, "bool":$structured, "bool":$implicit, CArg<"::mlir::ValueRange", "{}">:$bounds), [{ - build($_builder, $_state, accPtr, + build($_builder, $_state, accVar, bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr, /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, /*structured=*/$_builder.getBoolAttr(structured), /*implicit=*/$_builder.getBoolAttr(implicit), /*name=*/nullptr); }]>, - OpBuilder<(ins "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$accPtr, + OpBuilder<(ins "::mlir::Value":$accVar, "bool":$structured, "bool":$implicit, "const ::llvm::Twine &":$name, CArg<"::mlir::ValueRange", "{}">:$bounds), [{ - build($_builder, $_state, accPtr, + build($_builder, $_state, accVar, bounds, /*asyncOperands=*/{}, /*asyncOperandsDeviceType=*/nullptr, /*asyncOnly=*/nullptr, /*dataClause=*/nullptr, /*structured=*/$_builder.getBoolAttr(structured), diff --git a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h index 1bd0f6553fc8d..b3ec796a72337 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h +++ b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h @@ -116,6 +116,18 @@ bool canFoldIntoConsumerOp(CastOp castOp); /// this method provides a check that it is worth doing the canonicalization. 
bool canFoldIntoProducerOp(CastOp castOp); +/// Return true if any of the operands of `op` is a CastOp that can be folded +/// into its consumer, i.e. `op`. This is effectively a convenience wrapper for +/// `canFoldIntoProducerOp`. +bool hasFoldableTensorCastOperand(Operation *op); + +/// Assuming that `op` contains at least one operand that is a foldable CastOp +/// (i.e. `hasFoldableTensorCastOperand` returns true), calculate the updated +/// operands. +SmallVector +getUpdatedOperandsAfterCastOpFolding(DestinationStyleOpInterface op, + SmallVector &newResTy); + /// Performs folding of any operand of `op` if it comes from a tensor::CastOp /// that can be folded. LogicalResult foldTensorCast(Operation *op); diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 840558a81493f..b8755da8db32e 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -105,8 +105,8 @@ def Tosa_Conv2DOp : Tosa_ConvOp<"conv2d"> { Tosa_Tensor4D:$input, TosaTensorRankOf<[Tosa_Weight], [4]>:$weight, Tosa_Tensor1D:$bias, - Optional:$input_zp, - Optional:$weight_zp, + Optional:$input_zp, + Optional:$weight_zp, Tosa_IntArrayAttr4:$pad, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttr2:$dilation, @@ -136,8 +136,8 @@ def Tosa_Conv3DOp : Tosa_ConvOp<"conv3d"> { Tosa_Tensor5D:$input, TosaTensorRankOf<[Tosa_Weight], [5]>:$weight, Tosa_Tensor1D:$bias, - Optional:$input_zp, - Optional:$weight_zp, + Optional:$input_zp, + Optional:$weight_zp, Tosa_IntArrayAttr6:$pad, Tosa_IntArrayAttr3:$stride, Tosa_IntArrayAttr3:$dilation, @@ -168,8 +168,8 @@ def Tosa_DepthwiseConv2DOp : Tosa_ConvOp<"depthwise_conv2d"> { Tosa_Tensor4D:$input, TosaTensorRankOf<[Tosa_Weight], [4]>:$weight, Tosa_Tensor1D:$bias, - Optional:$input_zp, - Optional:$weight_zp, + Optional:$input_zp, + Optional:$weight_zp, Tosa_IntArrayAttr4:$pad, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttr2:$dilation, @@ -356,8 +356,8 @@ def Tosa_TransposeConv2DOp : Tosa_ConvOp<"transpose_conv2d"> { Tosa_Tensor4D:$input, TosaTensorRankOf<[Tosa_Weight], [4]>:$weight, Tosa_Tensor1D:$bias, - Optional:$input_zp, - Optional:$weight_zp, + Optional:$input_zp, + Optional:$weight_zp, Tosa_IntArrayAttr4:$out_pad, Tosa_IntArrayAttr2:$stride, Tosa_IntArrayAttr4:$out_shape, @@ -394,10 +394,8 @@ def Tosa_ClampOp : Tosa_ElementwiseUnaryOp<"clamp"> { let arguments = (ins Tosa_Tensor:$input, - I64Attr:$min_int, - I64Attr:$max_int, - Tosa_FloatAttr:$min_fp, - Tosa_FloatAttr:$max_fp, + Tosa_IntOrFloatAttr:$min_val, + Tosa_IntOrFloatAttr:$max_val, DefaultValuedAttr:$nan_mode ); @@ -819,7 +817,8 @@ def Tosa_MulOp : Tosa_Op<"mul", [ let arguments = (ins Tosa_Tensor:$input1, Tosa_Tensor:$input2, - Optional>:$shift + // Apply right shift on i32_t input data only + Tosa_ScalarInt8Tensor:$shift ); let results = (outs @@ -1592,7 +1591,7 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> { let arguments = (ins Tosa_RankedTensor:$input1, Tosa_Shape:$padding, - Optional:$pad_const, + Optional:$pad_const, OptionalAttr:$input_zp ); diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index 7aa1f72ec6e17..6457bb8749ee0 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -93,6 +93,10 @@ def HasNo0Dimensions : And<[ IsRankedTensorTypePred, CPred<"::llvm::all_of(::llvm::cast<::mlir::RankedTensorType>($_self).getShape(), [](auto v) { return v != 0; })">]>; +def 
AllDimensionsAreSizeOne : And<[ + IsRankedTensorTypePred, + CPred<"::llvm::all_of(::llvm::cast<::mlir::RankedTensorType>($_self).getShape(), [](auto v) { return v == 1; })">]>; + class TosaTensorOf< list allowedTypes, string summary = "tosa-conformant tensor"> : TensorOf], summary>; @@ -109,6 +113,11 @@ class TosaTensorRankOf allowedTypes, list ranks> [HasAnyRankOfPred], !interleave(!foreach(rank, ranks, rank # "D"), "/") # " tensor">; +class TosaScalarTensorOf allowedTypes, list ranks> + : TosaRankedTensorOf, AllDimensionsAreSizeOne], + "tosa-conformant scalar tensor">; + //===----------------------------------------------------------------------===// // Tensor types //===----------------------------------------------------------------------===// @@ -136,8 +145,10 @@ class Tosa_TensorOfOrNone allowedTypes, string description = ""> : // Tensor types with constrained ranks. //===----------------------------------------------------------------------===// -// Rank-0 (scalar) tensor -def Tosa_ScalarTensor : TosaTensorRankOf<[Tosa_AnyNumber], [0]>; +def Tosa_Rank0Tensor : TosaTensorRankOf<[Tosa_AnyNumber], [0]>; + +def Tosa_ScalarTensor : TosaScalarTensorOf<[Tosa_AnyNumber], [1]>; +def Tosa_ScalarInt8Tensor : TosaScalarTensorOf<[Tosa_Int8], [1]>; // We include unranked tensors as a supported type for all possible tosa // Tensors as unranked does not guarantee invalid. If unranked tensors exist @@ -202,6 +213,14 @@ def Tosa_FloatAttr : Attr($_self)">, let returnType = [{ ::mlir::APFloat }]; } +def Tosa_IntegerAttr : Attr($_self)">, + "arbitrary integer attribute"> { + let storageType = [{ ::mlir::IntegerAttr }]; + let returnType = [{ ::llvm::APInt }]; +} + +def Tosa_IntOrFloatAttr : AnyAttrOf<[Tosa_IntegerAttr, Tosa_FloatAttr]>; + //===----------------------------------------------------------------------===// // Iterable attributes. //===----------------------------------------------------------------------===// @@ -288,9 +307,4 @@ def Rank1TosaShape : TosaShapeOfRank<1>; def Rank2TosaShape : TosaShapeOfRank<2>; def Rank4TosaShape : TosaShapeOfRank<4>; -// NOTE: Tosa_ScalarTensor is currently defined as rank-0. If and when this -// becomes rank-1 it can be used in place of Tosa_ZeroPointTensor and the -// following def can be removed. -def Tosa_ZeroPointTensor : TosaTensorRankOf<[Tosa_AnyNumber], [1]>; - #endif // TOSA_TYPES_BASE diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 835c006356342..c821e7b1527b4 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -373,8 +373,9 @@ def Vector_BroadcastOp : The source operand is duplicated over all the missing leading dimensions and stretched over the trailing dimensions where the source has a non-equal - dimension of 1. These rules imply that any scalar broadcast (k=0) to any - shaped vector with the same element type is always legal. + dimension of 1 (stretching a trailing dimension is also referred to as + "dim-1" broadcasting). These rules imply that any scalar broadcast (k=0) to + any shaped vector with the same element type is always legal. Example: @@ -396,7 +397,7 @@ def Vector_BroadcastOp : /// Broadcast `value` to a vector of `dstShape`, knowing that exactly the /// `broadcastedDims` dimensions in the dstShape are broadcasted. - /// This requires (and asserts) that the broadcast is free of dim-1 + /// This requires (and asserts) that the broadcast is free of "dim-1" /// broadcasting. 
/// Since vector.broadcast only allows expanding leading dimensions, an extra /// vector.transpose may be inserted to make the broadcast possible. diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index 0da82825c8287..33bc89279c08c 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -23,6 +23,7 @@ #include "mlir/Dialect/Arith/Transforms/BufferDeallocationOpInterfaceImpl.h" #include "mlir/Dialect/Arith/Transforms/BufferViewFlowOpInterfaceImpl.h" #include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h" +#include "mlir/Dialect/Arith/Transforms/ShardingInterfaceImpl.h" #include "mlir/Dialect/ArmNeon/ArmNeonDialect.h" #include "mlir/Dialect/ArmSME/IR/ArmSME.h" #include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h" @@ -158,6 +159,7 @@ inline void registerAllDialects(DialectRegistry ®istry) { arith::registerBufferDeallocationOpInterfaceExternalModels(registry); arith::registerBufferizableOpInterfaceExternalModels(registry); arith::registerBufferViewFlowOpInterfaceExternalModels(registry); + arith::registerShardingInterfaceExternalModels(registry); arith::registerValueBoundsOpInterfaceExternalModels(registry); bufferization::func_ext::registerBufferizableOpInterfaceExternalModels( registry); diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp index 9e9411e5ede12..722f4df18e981 100644 --- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp @@ -152,7 +152,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( /// on a LoopLikeInterface return the lower/upper bound for that result if /// possible. auto getLoopBoundFromFold = [&](std::optional loopBound, - Type boundType, bool getUpper) { + Type boundType, Block *block, bool getUpper) { unsigned int width = ConstantIntRanges::getStorageBitwidth(boundType); if (loopBound.has_value()) { if (auto attr = dyn_cast(*loopBound)) { @@ -160,7 +160,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( return bound.getValue(); } else if (auto value = llvm::dyn_cast_if_present(*loopBound)) { const IntegerValueRangeLattice *lattice = - getLatticeElementFor(getProgramPointAfter(op), value); + getLatticeElementFor(getProgramPointBefore(block), value); if (lattice != nullptr && !lattice->getValue().isUninitialized()) return getUpper ? lattice->getValue().getValue().smax() : lattice->getValue().getValue().smin(); @@ -180,16 +180,17 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments( return SparseForwardDataFlowAnalysis ::visitNonControlFlowArguments( op, successor, argLattices, firstIndex); } + Block *block = iv->getParentBlock(); std::optional lowerBound = loop.getSingleLowerBound(); std::optional upperBound = loop.getSingleUpperBound(); std::optional step = loop.getSingleStep(); - APInt min = getLoopBoundFromFold(lowerBound, iv->getType(), + APInt min = getLoopBoundFromFold(lowerBound, iv->getType(), block, /*getUpper=*/false); - APInt max = getLoopBoundFromFold(upperBound, iv->getType(), + APInt max = getLoopBoundFromFold(upperBound, iv->getType(), block, /*getUpper=*/true); // Assume positivity for uniscoverable steps by way of getUpper = true. 
APInt stepVal = - getLoopBoundFromFold(step, iv->getType(), /*getUpper=*/true); + getLoopBoundFromFold(step, iv->getType(), block, /*getUpper=*/true); if (stepVal.isNegative()) { std::swap(min, max); diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 47a85c2a486fd..827db5f3eba84 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -299,7 +299,7 @@ struct PyAttrBuilderMap { return *builder; } static void dunderSetItemNamed(const std::string &attributeKind, - nb::callable func, bool replace) { + nb::callable func, bool replace) { PyGlobals::get().registerAttributeBuilder(attributeKind, std::move(func), replace); } @@ -3049,6 +3049,18 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("asm"), nb::arg("context").none() = nb::none(), kModuleParseDocstring) + .def_static( + "parseFile", + [](const std::string &path, DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirModule module = mlirModuleCreateParseFromFile( + context->get(), toMlirStringRef(path)); + if (mlirModuleIsNull(module)) + throw MLIRError("Unable to parse module assembly", errors.take()); + return PyModule::forModule(module).releaseObject(); + }, + nb::arg("path"), nb::arg("context").none() = nb::none(), + kModuleParseDocstring) .def_static( "create", [](DefaultingPyLocation loc) { diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp index 2b4697434717d..cc5aaed416512 100644 --- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp +++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp @@ -613,6 +613,9 @@ class RawEmitterOstream : public raw_ostream { } // namespace void EncodingEmitter::writeTo(raw_ostream &os) const { + // Reserve space in the ostream for the encoded contents. + os.reserveExtraSpace(size()); + for (auto &prevResult : prevResultList) os.write((const char *)prevResult.data(), prevResult.size()); os.write((const char *)currentResult.data(), currentResult.size()); diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index f27af0ca9a2c7..999e8cbda1295 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -22,6 +22,7 @@ #include "mlir/IR/Location.h" #include "mlir/IR/Operation.h" #include "mlir/IR/OperationSupport.h" +#include "mlir/IR/OwningOpRef.h" #include "mlir/IR/Types.h" #include "mlir/IR/Value.h" #include "mlir/IR/Verifier.h" @@ -328,6 +329,15 @@ MlirModule mlirModuleCreateParse(MlirContext context, MlirStringRef module) { return MlirModule{owning.release().getOperation()}; } +MlirModule mlirModuleCreateParseFromFile(MlirContext context, + MlirStringRef fileName) { + OwningOpRef owning = + parseSourceFile(unwrap(fileName), unwrap(context)); + if (!owning) + return MlirModule{nullptr}; + return MlirModule{owning.release().getOperation()}; +} + MlirContext mlirModuleGetContext(MlirModule module) { return wrap(unwrap(module).getContext()); } diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index 754ed89814293..ced18a48766bf 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -54,6 +54,25 @@ struct ConstrainedVectorConvertToLLVMPattern } }; +/// No-op bitcast. Propagate type input arg if converted source and dest types +/// are the same. 
+struct IdentityBitcastLowering final + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(arith::BitcastOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + Value src = adaptor.getIn(); + Type resultType = getTypeConverter()->convertType(op.getType()); + if (src.getType() != resultType) + return rewriter.notifyMatchFailure(op, "Types are different"); + + rewriter.replaceOp(op, src); + return success(); + } +}; + //===----------------------------------------------------------------------===// // Straightforward Op Lowerings //===----------------------------------------------------------------------===// @@ -524,6 +543,12 @@ void mlir::arith::registerConvertArithToLLVMInterface( void mlir::arith::populateArithToLLVMConversionPatterns( const LLVMTypeConverter &converter, RewritePatternSet &patterns) { + + // Set a higher pattern benefit for IdentityBitcastLowering so it will run + // before BitcastOpLowering. + patterns.add(converter, patterns.getContext(), + /*patternBenefit*/ 10); + // clang-format off patterns.add< AddFOpLowering, diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 0246d9019368a..d849c782bf08b 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -92,22 +92,27 @@ static Value createLinalgBodyCalculationForElementwiseOp( // tosa::MulOp if (isa(op)) { auto shift_val = cast(op).getShift(); + ElementsAttr shift_elem; + if (!shift_val.getImpl() || + !matchPattern(shift_val, m_Constant(&shift_elem))) { + (void)rewriter.notifyMatchFailure(op, "shift value of mul not found"); + } + + int32_t shift = shift_elem.getValues()[0].getInt(); if (isa(elementTy)) { + if (shift != 0) { + (void)rewriter.notifyMatchFailure(op, + "Cannot have shift value for float"); + return nullptr; + } return rewriter.create(loc, resultTypes, args[0], args[1]); } if (isa(elementTy)) { - int32_t shift = 0; - ElementsAttr shift_elem; - if (shift_val.getImpl() && - matchPattern(shift_val, m_Constant(&shift_elem))) { - // Explicit shift is set. 
- shift = shift_elem.getValues()[0].getInt(); - } - Value a = args[0]; Value b = args[1]; + if (shift > 0) { auto shiftConst = rewriter.create(loc, shift, /*bitwidth=*/8); @@ -389,8 +394,8 @@ static Value createLinalgBodyCalculationForElementwiseOp( // tosa::ClampOp if (isa(op) && isa(elementTy)) { bool losesInfo = false; - APFloat minApf = cast(op->getAttr("min_fp")).getValue(); - APFloat maxApf = cast(op->getAttr("max_fp")).getValue(); + APFloat minApf = cast(op->getAttr("min_val")).getValue(); + APFloat maxApf = cast(op->getAttr("max_val")).getValue(); minApf.convert(cast(elementTy).getFloatSemantics(), APFloat::rmNearestTiesToEven, &losesInfo); maxApf.convert(cast(elementTy).getFloatSemantics(), @@ -405,9 +410,9 @@ static Value createLinalgBodyCalculationForElementwiseOp( if (isa(op) && isa(elementTy)) { auto intTy = cast(elementTy); int64_t min = - cast(op->getAttr("min_int")).getValue().getSExtValue(); + cast(op->getAttr("min_val")).getValue().getSExtValue(); int64_t max = - cast(op->getAttr("max_int")).getValue().getSExtValue(); + cast(op->getAttr("max_val")).getValue().getSExtValue(); int64_t minRepresentable = std::numeric_limits::min(); int64_t maxRepresentable = std::numeric_limits::max(); diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp index b38dd8effe669..30019447d94e8 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp @@ -759,6 +759,9 @@ struct GreedyFusion { const DenseSet &srcEscapingMemRefs, unsigned producerId, unsigned consumerId, bool removeSrcNode) { + // We can't generate private memrefs if their size can't be computed. + if (!getMemRefIntOrFloatEltSizeInBytes(cast(memref.getType()))) + return false; const Node *consumerNode = mdg->getNode(consumerId); // If `memref` is an escaping one, do not create a private memref // for the below scenarios, since doing so will leave the escaping @@ -1162,24 +1165,48 @@ struct GreedyFusion { } assert(bestDstLoopDepth > 0 && "Unexpected loop fusion depth"); - assert(!depthSliceUnions[bestDstLoopDepth - 1].isEmpty() && + + const ComputationSliceState &bestSlice = + depthSliceUnions[bestDstLoopDepth - 1]; + assert(!bestSlice.isEmpty() && "Fusion depth has no computed slice union"); + + // Do not perform sibling fusion if it isn't maximal. We always remove the + // sibling node and as such fusion shouldn't be performed if a part of the + // slice is used in the destination. + auto isMaximal = bestSlice.isMaximal(); + if (!isMaximal.value_or(false)) { + LLVM_DEBUG(llvm::dbgs() + << "Slice isn't maximal; not performing sibling fusion.\n"); + continue; + } + // Check if source loop is being inserted in the innermost // destination loop. Based on this, the fused loop may be optimized // further inside `fuseLoops`. bool isInnermostInsertion = (bestDstLoopDepth == dstLoopDepthTest); // Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'. - affine::fuseLoops(sibAffineForOp, dstAffineForOp, - depthSliceUnions[bestDstLoopDepth - 1], + affine::fuseLoops(sibAffineForOp, dstAffineForOp, bestSlice, isInnermostInsertion); auto dstForInst = cast(dstNode->op); // Update operation position of fused loop nest (if needed). 
- if (insertPointInst != dstForInst) { + if (insertPointInst != dstForInst) dstForInst->moveBefore(insertPointInst); - } + + LLVM_DEBUG(llvm::dbgs() + << "Fused sibling nest " << sibId << " into destination nest " + << dstNode->id << " at depth " << bestDstLoopDepth << ":\n" + << dstAffineForOp << "\n"); + // Update data dependence graph state post fusion. updateStateAfterSiblingFusion(sibNode, dstNode); + + // Remove old sibling loop nest. + // Get op before we invalidate the MDG node. + Operation *op = sibNode->op; + mdg->removeNode(sibNode->id); + op->erase(); } } @@ -1321,13 +1348,6 @@ struct GreedyFusion { mdg->addToNode(dstNode->id, dstLoopCollector.loadOpInsts, dstLoopCollector.storeOpInsts, dstLoopCollector.memrefLoads, dstLoopCollector.memrefStores, dstLoopCollector.memrefFrees); - // Remove old sibling loop nest if it no longer has outgoing dependence - // edges, and it does not write to a memref which escapes the block. - if (mdg->getOutEdgeCount(sibNode->id) == 0) { - Operation *op = sibNode->op; - mdg->removeNode(sibNode->id); - op->erase(); - } } // Clean up any allocs with no users. diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index 57df7ada91654..7ff77968c61ad 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -82,7 +82,7 @@ static bool isInnermostAffineForOp(AffineForOp op) { } /// Gathers loops that have no affine.for's nested within. -static void gatherInnermostLoops(func::FuncOp f, +static void gatherInnermostLoops(FunctionOpInterface f, SmallVectorImpl &loops) { f.walk([&](AffineForOp forOp) { if (isInnermostAffineForOp(forOp)) @@ -91,7 +91,7 @@ static void gatherInnermostLoops(func::FuncOp f, } void LoopUnroll::runOnOperation() { - func::FuncOp func = getOperation(); + FunctionOpInterface func = getOperation(); if (func.isExternal()) return; @@ -100,8 +100,8 @@ void LoopUnroll::runOnOperation() { SmallVector loops; // Gathers all loops with trip count <= minTripCount. Do a post order walk - // so that loops are gathered from innermost to outermost (or else unrolling - // an outer one may delete gathered inner ones). + // so that loops are gathered from innermost to outermost (or else + // unrolling an outer one may delete gathered inner ones). 
     getOperation().walk([&](AffineForOp forOp) {
       std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
       if (tripCount && *tripCount <= unrollFullThreshold)
@@ -145,7 +145,8 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) {
                              cleanUpUnroll);
 }
 
-std::unique_ptr> mlir::affine::createLoopUnrollPass(
+std::unique_ptr>
+mlir::affine::createLoopUnrollPass(
     int unrollFactor, bool unrollUpToFactor, bool unrollFull,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor) {
   return std::make_unique<LoopUnroll>(
diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
index 6149b35befe7d..f96bda603baa6 100644
--- a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
@@ -8,6 +8,7 @@ add_mlir_dialect_library(MLIRArithTransforms
   ExpandOps.cpp
   IntRangeOptimizations.cpp
   ReifyValueBounds.cpp
+  ShardingInterfaceImpl.cpp
   UnsignedWhenEquivalent.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -26,7 +27,9 @@ add_mlir_dialect_library(MLIRArithTransforms
   MLIRInferIntRangeInterface
   MLIRIR
   MLIRMemRefDialect
+  MLIRMeshDialect
   MLIRPass
+  MLIRShardingInterface
   MLIRTensorDialect
   MLIRTransforms
   MLIRTransformUtils
diff --git a/mlir/lib/Dialect/Arith/Transforms/ShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/Transforms/ShardingInterfaceImpl.cpp
new file mode 100644
index 0000000000000..62d137a4cfb0e
--- /dev/null
+++ b/mlir/lib/Dialect/Arith/Transforms/ShardingInterfaceImpl.cpp
@@ -0,0 +1,105 @@
+//===- ShardingInterfaceImpl.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/ShardingInterfaceImpl.h"
+#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "llvm/Support/Debug.h"
+
+using namespace mlir;
+using namespace mlir::arith;
+using namespace mlir::mesh;
+
+namespace {
+
+// Sharding of arith.constant
+// RankedTensor constants can be sharded like any other tensor.
+// %cst = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32>
+// %sharding = mesh.sharding @mesh4x4 split_axes = [[0]] : !mesh.sharding
+// Scalar constants are always replicated and need no sharding annotation.
+
+struct ConstantShardingInterface
+    : public ShardingInterface::ExternalModel<ConstantShardingInterface,
+                                              ConstantOp> {
+  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
+    auto ndims = 0;
+    if (auto type = dyn_cast<RankedTensorType>(op->getResult(0).getType())) {
+      ndims = type.getRank();
+    }
+    return SmallVector<utils::IteratorType>(ndims,
+                                            utils::IteratorType::parallel);
+  }
+
+  SmallVector<AffineMap> getIndexingMaps(Operation *op) const {
+    if (auto type = dyn_cast<RankedTensorType>(op->getResult(0).getType())) {
+      return SmallVector<AffineMap>(1, {AffineMap::getMultiDimIdentityMap(
+                                           type.getRank(), op->getContext())});
+    }
+    return {};
+  }
+
+  // Indicate failure if no result sharding exists.
+  // Otherwise mirror result sharding if it is a tensor constant.
+  // Otherwise return replication option.
+ FailureOr + getShardingOption(Operation *op, ArrayRef operandShardings, + ArrayRef resultShardings) const { + assert(resultShardings.size() == 1 && + "Expecting exactly one result sharding for arith.constant"); + auto resultSharding = resultShardings[0]; + if (!resultSharding) { + return failure(); + } + if (auto type = dyn_cast(op->getResult(0).getType())) { + ShardingArray axesArray(resultSharding.getSplitAxes().size()); + for (auto [i, axes] : llvm::enumerate(resultSharding.getSplitAxes())) { + axesArray[i].append(axes.asArrayRef().begin(), axes.asArrayRef().end()); + } + return ShardingOption(axesArray, resultSharding.getMeshAttr()); + } + return ShardingOption({}, resultSharding.getMeshAttr()); + } + + LogicalResult spmdize(Operation *op, ArrayRef spmdizedOperands, + ArrayRef operandShardings, + ArrayRef resultShardings, + IRMapping &spmdizationMap, + SymbolTableCollection &symbolTable, + OpBuilder &builder) const { + auto cOp = cast(op); + if (auto value = dyn_cast(cOp.getValue())) { + if (!value.isSplat() || !resultShardings[0]) { + // Currently non-splat constants are not supported. + return failure(); + } + auto sharding = resultShardings[0]; + auto newType = cast(shardType( + cOp.getType(), getMesh(op, sharding.getMeshAttr(), symbolTable), + sharding)); + auto newValue = value.resizeSplat(newType); + auto newOp = builder.create(op->getLoc(), newType, newValue); + spmdizationMap.map(op->getResult(0), newOp.getResult()); + spmdizationMap.map(op, newOp.getOperation()); + } else { + // `clone` will populate the mapping of old to new results. + (void)builder.clone(*op, spmdizationMap); + } + return success(); + } +}; +} // namespace + +void mlir::arith::registerShardingInterfaceExternalModels( + DialectRegistry ®istry) { + + registry.addExtension(+[](MLIRContext *ctx, ArithDialect *dialect) { + ConstantOp::template attachInterface(*ctx); + }); +} diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp deleted file mode 100644 index a0a81d4add712..0000000000000 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp +++ /dev/null @@ -1,693 +0,0 @@ -//===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements logic for computing correct alloc and dealloc positions. -// Furthermore, buffer deallocation also adds required new clone operations to -// ensure that all buffers are deallocated. The main class is the -// BufferDeallocationPass class that implements the underlying algorithm. In -// order to put allocations and deallocations at safe positions, it is -// significantly important to put them into the correct blocks. However, the -// liveness analysis does not pay attention to aliases, which can occur due to -// branches (and their associated block arguments) in general. For this purpose, -// BufferDeallocation firstly finds all possible aliases for a single value -// (using the BufferViewFlowAnalysis class). Consider the following example: -// -// ^bb0(%arg0): -// cf.cond_br %cond, ^bb1, ^bb2 -// ^bb1: -// cf.br ^exit(%arg0) -// ^bb2: -// %new_value = ... 
-// cf.br ^exit(%new_value) -// ^exit(%arg1): -// return %arg1; -// -// We should place the dealloc for %new_value in exit. However, we have to free -// the buffer in the same block, because it cannot be freed in the post -// dominator. However, this requires a new clone buffer for %arg1 that will -// contain the actual contents. Using the class BufferViewFlowAnalysis, we -// will find out that %new_value has a potential alias %arg1. In order to find -// the dealloc position we have to find all potential aliases, iterate over -// their uses and find the common post-dominator block (note that additional -// clones and buffers remove potential aliases and will influence the placement -// of the deallocs). In all cases, the computed block can be safely used to free -// the %new_value buffer (may be exit or bb2) as it will die and we can use -// liveness information to determine the exact operation after which we have to -// insert the dealloc. However, the algorithm supports introducing clone buffers -// and placing deallocs in safe locations to ensure that all buffers will be -// freed in the end. -// -// TODO: -// The current implementation does not support explicit-control-flow loops and -// the resulting code will be invalid with respect to program semantics. -// However, structured control-flow loops are fully supported. Furthermore, it -// doesn't accept functions which return buffers already. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Bufferization/Transforms/Passes.h" - -#include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "llvm/ADT/SetOperations.h" - -namespace mlir { -namespace bufferization { -#define GEN_PASS_DEF_BUFFERDEALLOCATION -#include "mlir/Dialect/Bufferization/Transforms/Passes.h.inc" -} // namespace bufferization -} // namespace mlir - -using namespace mlir; -using namespace mlir::bufferization; - -/// Walks over all immediate return-like terminators in the given region. -static LogicalResult walkReturnOperations( - Region *region, - llvm::function_ref func) { - for (Block &block : *region) { - Operation *terminator = block.getTerminator(); - // Skip non region-return-like terminators. - if (auto regionTerminator = - dyn_cast(terminator)) { - if (failed(func(regionTerminator))) - return failure(); - } - } - return success(); -} - -/// Checks if all operations that have at least one attached region implement -/// the RegionBranchOpInterface. This is not required in edge cases, where we -/// have a single attached region and the parent operation has no results. -static bool validateSupportedControlFlow(Operation *op) { - WalkResult result = op->walk([&](Operation *operation) { - // Only check ops that are inside a function. - if (!operation->getParentOfType()) - return WalkResult::advance(); - - auto regions = operation->getRegions(); - // Walk over all operations in a region and check if the operation has at - // least one region and implements the RegionBranchOpInterface. If there - // is an operation that does not fulfill this condition, we cannot apply - // the deallocation steps. Furthermore, we accept cases, where we have a - // region that returns no results, since, in that case, the intra-region - // control flow does not affect the transformation. 
- size_t size = regions.size(); - if (((size == 1 && !operation->getResults().empty()) || size > 1) && - !dyn_cast(operation)) { - operation->emitError("All operations with attached regions need to " - "implement the RegionBranchOpInterface."); - } - - return WalkResult::advance(); - }); - return !result.wasSkipped(); -} - -namespace { - -//===----------------------------------------------------------------------===// -// Backedges analysis -//===----------------------------------------------------------------------===// - -/// A straight-forward program analysis which detects loop backedges induced by -/// explicit control flow. -class Backedges { -public: - using BlockSetT = SmallPtrSet; - using BackedgeSetT = llvm::DenseSet>; - -public: - /// Constructs a new backedges analysis using the op provided. - Backedges(Operation *op) { recurse(op); } - - /// Returns the number of backedges formed by explicit control flow. - size_t size() const { return edgeSet.size(); } - - /// Returns the start iterator to loop over all backedges. - BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); } - - /// Returns the end iterator to loop over all backedges. - BackedgeSetT::const_iterator end() const { return edgeSet.end(); } - -private: - /// Enters the current block and inserts a backedge into the `edgeSet` if we - /// have already visited the current block. The inserted edge links the given - /// `predecessor` with the `current` block. - bool enter(Block ¤t, Block *predecessor) { - bool inserted = visited.insert(¤t).second; - if (!inserted) - edgeSet.insert(std::make_pair(predecessor, ¤t)); - return inserted; - } - - /// Leaves the current block. - void exit(Block ¤t) { visited.erase(¤t); } - - /// Recurses into the given operation while taking all attached regions into - /// account. - void recurse(Operation *op) { - Block *current = op->getBlock(); - // If the current op implements the `BranchOpInterface`, there can be - // cycles in the scope of all successor blocks. - if (isa(op)) { - for (Block *succ : current->getSuccessors()) - recurse(*succ, current); - } - // Recurse into all distinct regions and check for explicit control-flow - // loops. - for (Region ®ion : op->getRegions()) { - if (!region.empty()) - recurse(region.front(), current); - } - } - - /// Recurses into explicit control-flow structures that are given by - /// the successor relation defined on the block level. - void recurse(Block &block, Block *predecessor) { - // Try to enter the current block. If this is not possible, we are - // currently processing this block and can safely return here. - if (!enter(block, predecessor)) - return; - - // Recurse into all operations and successor blocks. - for (Operation &op : block.getOperations()) - recurse(&op); - - // Leave the current block. - exit(block); - } - - /// Stores all blocks that are currently visited and on the processing stack. - BlockSetT visited; - - /// Stores all backedges in the format (source, target). - BackedgeSetT edgeSet; -}; - -//===----------------------------------------------------------------------===// -// BufferDeallocation -//===----------------------------------------------------------------------===// - -/// The buffer deallocation transformation which ensures that all allocs in the -/// program have a corresponding de-allocation. As a side-effect, it might also -/// introduce clones that in turn leads to additional deallocations. 
-class BufferDeallocation : public BufferPlacementTransformationBase { -public: - using AliasAllocationMapT = - llvm::DenseMap; - - BufferDeallocation(Operation *op) - : BufferPlacementTransformationBase(op), dominators(op), - postDominators(op) {} - - /// Checks if all allocation operations either provide an already existing - /// deallocation operation or implement the AllocationOpInterface. In - /// addition, this method initializes the internal alias to - /// AllocationOpInterface mapping in order to get compatible - /// AllocationOpInterface implementations for aliases. - LogicalResult prepare() { - for (const BufferPlacementAllocs::AllocEntry &entry : allocs) { - // Get the defining allocation operation. - Value alloc = std::get<0>(entry); - auto allocationInterface = - alloc.getDefiningOp(); - // If there is no existing deallocation operation and no implementation of - // the AllocationOpInterface, we cannot apply the BufferDeallocation pass. - if (!std::get<1>(entry) && !allocationInterface) { - return alloc.getDefiningOp()->emitError( - "Allocation is not deallocated explicitly nor does the operation " - "implement the AllocationOpInterface."); - } - - // Register the current allocation interface implementation. - aliasToAllocations[alloc] = allocationInterface; - - // Get the alias information for the current allocation node. - for (Value alias : aliases.resolve(alloc)) { - // TODO: check for incompatible implementations of the - // AllocationOpInterface. This could be realized by promoting the - // AllocationOpInterface to a DialectInterface. - aliasToAllocations[alias] = allocationInterface; - } - } - return success(); - } - - /// Performs the actual placement/creation of all temporary clone and dealloc - /// nodes. - LogicalResult deallocate() { - // Add additional clones that are required. - if (failed(introduceClones())) - return failure(); - - // Place deallocations for all allocation entries. - return placeDeallocs(); - } - -private: - /// Introduces required clone operations to avoid memory leaks. - LogicalResult introduceClones() { - // Initialize the set of values that require a dedicated memory free - // operation since their operands cannot be safely deallocated in a post - // dominator. - SetVector valuesToFree; - llvm::SmallDenseSet> visitedValues; - SmallVector, 8> toProcess; - - // Check dominance relation for proper dominance properties. If the given - // value node does not dominate an alias, we will have to create a clone in - // order to free all buffers that can potentially leak into a post - // dominator. - auto findUnsafeValues = [&](Value source, Block *definingBlock) { - auto it = aliases.find(source); - if (it == aliases.end()) - return; - for (Value value : it->second) { - if (valuesToFree.count(value) > 0) - continue; - Block *parentBlock = value.getParentBlock(); - // Check whether we have to free this particular block argument or - // generic value. We have to free the current alias if it is either - // defined in a non-dominated block or it is defined in the same block - // but the current value is not dominated by the source value. - if (!dominators.dominates(definingBlock, parentBlock) || - (definingBlock == parentBlock && isa(value))) { - toProcess.emplace_back(value, parentBlock); - valuesToFree.insert(value); - } else if (visitedValues.insert(std::make_tuple(value, definingBlock)) - .second) - toProcess.emplace_back(value, definingBlock); - } - }; - - // Detect possibly unsafe aliases starting from all allocations. 
- for (BufferPlacementAllocs::AllocEntry &entry : allocs) { - Value allocValue = std::get<0>(entry); - findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock()); - } - // Try to find block arguments that require an explicit free operation - // until we reach a fix point. - while (!toProcess.empty()) { - auto current = toProcess.pop_back_val(); - findUnsafeValues(std::get<0>(current), std::get<1>(current)); - } - - // Update buffer aliases to ensure that we free all buffers and block - // arguments at the correct locations. - aliases.remove(valuesToFree); - - // Add new allocs and additional clone operations. - for (Value value : valuesToFree) { - if (failed(isa(value) - ? introduceBlockArgCopy(cast(value)) - : introduceValueCopyForRegionResult(value))) - return failure(); - - // Register the value to require a final dealloc. Note that we do not have - // to assign a block here since we do not want to move the allocation node - // to another location. - allocs.registerAlloc(std::make_tuple(value, nullptr)); - } - return success(); - } - - /// Introduces temporary clones in all predecessors and copies the source - /// values into the newly allocated buffers. - LogicalResult introduceBlockArgCopy(BlockArgument blockArg) { - // Allocate a buffer for the current block argument in the block of - // the associated value (which will be a predecessor block by - // definition). - Block *block = blockArg.getOwner(); - for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) { - // Get the terminator and the value that will be passed to our - // argument. - Operation *terminator = (*it)->getTerminator(); - auto branchInterface = cast(terminator); - SuccessorOperands operands = - branchInterface.getSuccessorOperands(it.getSuccessorIndex()); - - // Query the associated source value. - Value sourceValue = operands[blockArg.getArgNumber()]; - if (!sourceValue) { - return failure(); - } - // Wire new clone and successor operand. - // Create a new clone at the current location of the terminator. - auto clone = introduceCloneBuffers(sourceValue, terminator); - if (failed(clone)) - return failure(); - operands.slice(blockArg.getArgNumber(), 1).assign(*clone); - } - - // Check whether the block argument has implicitly defined predecessors via - // the RegionBranchOpInterface. This can be the case if the current block - // argument belongs to the first block in a region and the parent operation - // implements the RegionBranchOpInterface. - Region *argRegion = block->getParent(); - Operation *parentOp = argRegion->getParentOp(); - RegionBranchOpInterface regionInterface; - if (&argRegion->front() != block || - !(regionInterface = dyn_cast(parentOp))) - return success(); - - if (failed(introduceClonesForRegionSuccessors( - regionInterface, argRegion->getParentOp()->getRegions(), blockArg, - [&](RegionSuccessor &successorRegion) { - // Find a predecessor of our argRegion. - return successorRegion.getSuccessor() == argRegion; - }))) - return failure(); - - // Check whether the block argument belongs to an entry region of the - // parent operation. In this case, we have to introduce an additional clone - // for buffer that is passed to the argument. 
- SmallVector successorRegions; - regionInterface.getSuccessorRegions(/*point=*/RegionBranchPoint::parent(), - successorRegions); - auto *it = - llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) { - return successorRegion.getSuccessor() == argRegion; - }); - if (it == successorRegions.end()) - return success(); - - // Determine the actual operand to introduce a clone for and rewire the - // operand to point to the clone instead. - auto operands = regionInterface.getEntrySuccessorOperands(argRegion); - size_t operandIndex = - llvm::find(it->getSuccessorInputs(), blockArg).getIndex() + - operands.getBeginOperandIndex(); - Value operand = parentOp->getOperand(operandIndex); - assert(operand == - operands[operandIndex - operands.getBeginOperandIndex()] && - "region interface operands don't match parentOp operands"); - auto clone = introduceCloneBuffers(operand, parentOp); - if (failed(clone)) - return failure(); - - parentOp->setOperand(operandIndex, *clone); - return success(); - } - - /// Introduces temporary clones in front of all associated nested-region - /// terminators and copies the source values into the newly allocated buffers. - LogicalResult introduceValueCopyForRegionResult(Value value) { - // Get the actual result index in the scope of the parent terminator. - Operation *operation = value.getDefiningOp(); - auto regionInterface = cast(operation); - // Filter successors that return to the parent operation. - auto regionPredicate = [&](RegionSuccessor &successorRegion) { - // If the RegionSuccessor has no associated successor, it will return to - // its parent operation. - return !successorRegion.getSuccessor(); - }; - // Introduce a clone for all region "results" that are returned to the - // parent operation. This is required since the parent's result value has - // been considered critical. Therefore, the algorithm assumes that a clone - // of a previously allocated buffer is returned by the operation (like in - // the case of a block argument). - return introduceClonesForRegionSuccessors( - regionInterface, operation->getRegions(), value, regionPredicate); - } - - /// Introduces buffer clones for all terminators in the given regions. The - /// regionPredicate is applied to every successor region in order to restrict - /// the clones to specific regions. - template - LogicalResult introduceClonesForRegionSuccessors( - RegionBranchOpInterface regionInterface, MutableArrayRef regions, - Value argValue, const TPredicate ®ionPredicate) { - for (Region ®ion : regions) { - // Query the regionInterface to get all successor regions of the current - // one. - SmallVector successorRegions; - regionInterface.getSuccessorRegions(region, successorRegions); - // Try to find a matching region successor. - RegionSuccessor *regionSuccessor = - llvm::find_if(successorRegions, regionPredicate); - if (regionSuccessor == successorRegions.end()) - continue; - // Get the operand index in the context of the current successor input - // bindings. - size_t operandIndex = - llvm::find(regionSuccessor->getSuccessorInputs(), argValue) - .getIndex(); - - // Iterate over all immediate terminator operations to introduce - // new buffer allocations. Thereby, the appropriate terminator operand - // will be adjusted to point to the newly allocated buffer instead. - if (failed(walkReturnOperations( - ®ion, [&](RegionBranchTerminatorOpInterface terminator) { - // Get the actual mutable operands for this terminator op. 
- auto terminatorOperands = - terminator.getMutableSuccessorOperands(*regionSuccessor); - // Extract the source value from the current terminator. - // This conversion needs to exist on a separate line due to a - // bug in GCC conversion analysis. - OperandRange immutableTerminatorOperands = terminatorOperands; - Value sourceValue = immutableTerminatorOperands[operandIndex]; - // Create a new clone at the current location of the terminator. - auto clone = introduceCloneBuffers(sourceValue, terminator); - if (failed(clone)) - return failure(); - // Wire clone and terminator operand. - terminatorOperands.slice(operandIndex, 1).assign(*clone); - return success(); - }))) - return failure(); - } - return success(); - } - - /// Creates a new memory allocation for the given source value and clones - /// its content into the newly allocated buffer. The terminator operation is - /// used to insert the clone operation at the right place. - FailureOr introduceCloneBuffers(Value sourceValue, - Operation *terminator) { - // Avoid multiple clones of the same source value. This can happen in the - // presence of loops when a branch acts as a backedge while also having - // another successor that returns to its parent operation. Note: that - // copying copied buffers can introduce memory leaks since the invariant of - // BufferDeallocation assumes that a buffer will be only cloned once into a - // temporary buffer. Hence, the construction of clone chains introduces - // additional allocations that are not tracked automatically by the - // algorithm. - if (clonedValues.contains(sourceValue)) - return sourceValue; - // Create a new clone operation that copies the contents of the old - // buffer to the new one. - auto clone = buildClone(terminator, sourceValue); - if (succeeded(clone)) { - // Remember the clone of original source value. - clonedValues.insert(*clone); - } - return clone; - } - - /// Finds correct dealloc positions according to the algorithm described at - /// the top of the file for all alloc nodes and block arguments that can be - /// handled by this analysis. - LogicalResult placeDeallocs() { - // Move or insert deallocs using the previously computed information. - // These deallocations will be linked to their associated allocation nodes - // since they don't have any aliases that can (potentially) increase their - // liveness. - for (const BufferPlacementAllocs::AllocEntry &entry : allocs) { - Value alloc = std::get<0>(entry); - auto aliasesSet = aliases.resolve(alloc); - assert(!aliasesSet.empty() && "must contain at least one alias"); - - // Determine the actual block to place the dealloc and get liveness - // information. - Block *placementBlock = - findCommonDominator(alloc, aliasesSet, postDominators); - const LivenessBlockInfo *livenessInfo = - liveness.getLiveness(placementBlock); - - // We have to ensure that the dealloc will be after the last use of all - // aliases of the given value. We first assume that there are no uses in - // the placementBlock and that we can safely place the dealloc at the - // beginning. - Operation *endOperation = &placementBlock->front(); - - // Iterate over all aliases and ensure that the endOperation will point - // to the last operation of all potential aliases in the placementBlock. - for (Value alias : aliasesSet) { - // Ensure that the start operation is at least the defining operation of - // the current alias to avoid invalid placement of deallocs for aliases - // without any uses. 
- Operation *beforeOp = endOperation; - if (alias.getDefiningOp() && - !(beforeOp = placementBlock->findAncestorOpInBlock( - *alias.getDefiningOp()))) - continue; - - Operation *aliasEndOperation = - livenessInfo->getEndOperation(alias, beforeOp); - // Check whether the aliasEndOperation lies in the desired block and - // whether it is behind the current endOperation. If yes, this will be - // the new endOperation. - if (aliasEndOperation->getBlock() == placementBlock && - endOperation->isBeforeInBlock(aliasEndOperation)) - endOperation = aliasEndOperation; - } - // endOperation is the last operation behind which we can safely store - // the dealloc taking all potential aliases into account. - - // If there is an existing dealloc, move it to the right place. - Operation *deallocOperation = std::get<1>(entry); - if (deallocOperation) { - deallocOperation->moveAfter(endOperation); - } else { - // If the Dealloc position is at the terminator operation of the - // block, then the value should escape from a deallocation. - Operation *nextOp = endOperation->getNextNode(); - if (!nextOp) - continue; - // If there is no dealloc node, insert one in the right place. - if (failed(buildDealloc(nextOp, alloc))) - return failure(); - } - } - return success(); - } - - /// Builds a deallocation operation compatible with the given allocation - /// value. If there is no registered AllocationOpInterface implementation for - /// the given value (e.g. in the case of a function parameter), this method - /// builds a memref::DeallocOp. - LogicalResult buildDealloc(Operation *op, Value alloc) { - OpBuilder builder(op); - auto it = aliasToAllocations.find(alloc); - if (it != aliasToAllocations.end()) { - // Call the allocation op interface to build a supported and - // compatible deallocation operation. - auto dealloc = it->second.buildDealloc(builder, alloc); - if (!dealloc) - return op->emitError() - << "allocations without compatible deallocations are " - "not supported"; - } else { - // Build a "default" DeallocOp for unknown allocation sources. - builder.create(alloc.getLoc(), alloc); - } - return success(); - } - - /// Builds a clone operation compatible with the given allocation value. If - /// there is no registered AllocationOpInterface implementation for the given - /// value (e.g. in the case of a function parameter), this method builds a - /// bufferization::CloneOp. - FailureOr buildClone(Operation *op, Value alloc) { - OpBuilder builder(op); - auto it = aliasToAllocations.find(alloc); - if (it != aliasToAllocations.end()) { - // Call the allocation op interface to build a supported and - // compatible clone operation. - auto clone = it->second.buildClone(builder, alloc); - if (clone) - return *clone; - return (LogicalResult)(op->emitError() - << "allocations without compatible clone ops " - "are not supported"); - } - // Build a "default" CloneOp for unknown allocation sources. - return builder.create(alloc.getLoc(), alloc) - .getResult(); - } - - /// The dominator info to find the appropriate start operation to move the - /// allocs. - DominanceInfo dominators; - - /// The post dominator info to move the dependent allocs in the right - /// position. - PostDominanceInfo postDominators; - - /// Stores already cloned buffers to avoid additional clones of clones. - ValueSetT clonedValues; - - /// Maps aliases to their source allocation interfaces (inverse mapping). 
- AliasAllocationMapT aliasToAllocations; -}; - -//===----------------------------------------------------------------------===// -// BufferDeallocationPass -//===----------------------------------------------------------------------===// - -/// The actual buffer deallocation pass that inserts and moves dealloc nodes -/// into the right positions. Furthermore, it inserts additional clones if -/// necessary. It uses the algorithm described at the top of the file. -struct BufferDeallocationPass - : public bufferization::impl::BufferDeallocationBase< - BufferDeallocationPass> { - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - } - - void runOnOperation() override { - func::FuncOp func = getOperation(); - if (func.isExternal()) - return; - - if (failed(deallocateBuffers(func))) - signalPassFailure(); - } -}; - -} // namespace - -LogicalResult bufferization::deallocateBuffers(Operation *op) { - if (isa(op)) { - WalkResult result = op->walk([&](func::FuncOp funcOp) { - if (failed(deallocateBuffers(funcOp))) - return WalkResult::interrupt(); - return WalkResult::advance(); - }); - return success(!result.wasInterrupted()); - } - - // Ensure that there are supported loops only. - Backedges backedges(op); - if (backedges.size()) { - op->emitError("Only structured control-flow loops are supported."); - return failure(); - } - - // Check that the control flow structures are supported. - if (!validateSupportedControlFlow(op)) - return failure(); - - // Gather all required allocation nodes and prepare the deallocation phase. - BufferDeallocation deallocation(op); - - // Check for supported AllocationOpInterface implementations and prepare the - // internal deallocation pass. - if (failed(deallocation.prepare())) - return failure(); - - // Place all required temporary clone and dealloc nodes. 
- if (failed(deallocation.deallocate())) - return failure(); - - return success(); -} - -//===----------------------------------------------------------------------===// -// BufferDeallocationPass construction -//===----------------------------------------------------------------------===// - -std::unique_ptr mlir::bufferization::createBufferDeallocationPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt index 50104e8f8346b..7c38621be1bb5 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Bufferization/Transforms/CMakeLists.txt @@ -1,6 +1,5 @@ add_mlir_dialect_library(MLIRBufferizationTransforms Bufferize.cpp - BufferDeallocation.cpp BufferDeallocationSimplification.cpp BufferOptimizations.cpp BufferResultsToOutParams.cpp diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp index 2510e774f2b2a..b057554c40d8c 100644 --- a/mlir/lib/Dialect/DLTI/DLTI.cpp +++ b/mlir/lib/Dialect/DLTI/DLTI.cpp @@ -508,6 +508,9 @@ getClosestQueryable(Operation *op) { FailureOr dlti::query(Operation *op, ArrayRef keys, bool emitError) { + if (!op) + return failure(); + if (keys.empty()) { if (emitError) { auto diag = op->emitError() << "target op of failed DLTI query"; @@ -562,6 +565,19 @@ dlti::query(Operation *op, ArrayRef keys, bool emitError) { return currentAttr; } +FailureOr dlti::query(Operation *op, ArrayRef keys, + bool emitError) { + if (!op) + return failure(); + + MLIRContext *ctx = op->getContext(); + SmallVector entryKeys(keys.size()); + for (StringRef key : keys) + entryKeys.push_back(StringAttr::get(ctx, key)); + + return dlti::query(op, entryKeys, emitError); +} + constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutAttrName; constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutEndiannessKey; constexpr const StringLiteral mlir::DLTIDialect::kDataLayoutEndiannessBig; diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp index d79399b6588be..c906f3bdcc632 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @@ -399,6 +399,18 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, if (!genericOp->getResult(0).hasOneUse()) return failure(); + // TODO: Add an option for allowing padding values. It could introduce + // undefined behavior if we unconditionally propagate pack op through all + // the ops. E.g., if the padding value is zero and there are division ops in + // a generic op. Some values of padding area could be NaN (0/0). + if (packOp.getPaddingValue()) + return failure(); + + OpOperand *opOperand = genericOp.getDpsInitOperand(0); + auto packInfo = getPackingInfoFromOperand(opOperand, genericOp, packOp); + if (failed(packInfo)) + return failure(); + // We want to move the pack not the generic. OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(genericOp); @@ -422,18 +434,6 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, return failure(); } - // TODO: Add an option for allowing padding values. It could introduce - // undefined behavior if we unconditionally propagate pack op through all - // the ops. E.g., if the padding value is zero and there are division ops in - // a generic op. Some values of padding area could be NaN (0/0). 
- if (packOp.getPaddingValue()) - return failure(); - - OpOperand *opOperand = genericOp.getDpsInitOperand(0); - auto packInfo = getPackingInfoFromOperand(opOperand, genericOp, packOp); - if (failed(packInfo)) - return failure(); - // Rebuild the indexing map for the corresponding init operand. auto [packedOutOperand, packedOutIndexingMap] = getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo, diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index 33460ff25e9e4..304ede195c762 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -194,6 +194,12 @@ static void shardShape(const InShape &inShape, const MeshShape &meshShape, const SplitAxes &splitAxes, OutShape &outShape, ArrayRef shardedDimsOffsets = {}, ArrayRef haloSizes = {}) { + // 0d tensors cannot be sharded and must get replicated + if (inShape.empty()) { + assert(outShape.empty()); + return; + } + std::copy(llvm::adl_begin(inShape), llvm::adl_end(inShape), llvm::adl_begin(outShape)); @@ -271,7 +277,8 @@ Type mesh::shardType(Type type, MeshOp mesh, MeshSharding sharding) { void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding, OpOperand &operand, - OpBuilder &builder) { + OpBuilder &builder, + ShardOp &newShardOp) { OpBuilder::InsertionGuard insertionGuard(builder); Value operandValue = operand.get(); Operation *operandOp = operand.getOwner(); @@ -279,14 +286,20 @@ void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding, ShardOp shardOp = dyn_cast(operandOp); if (shardOp && sharding == shardOp.getSharding() && !shardOp.getAnnotateForUsers()) { - // No need for anything the correct sharding is already set. + // No need for anything if the correct sharding is already set. 
+ if (!newShardOp) { + newShardOp = shardOp; + } return; } - auto shardingOp = builder.create(operandValue.getLoc(), sharding); - auto newShardOp = - builder.create(operandValue.getLoc(), operandValue, shardingOp, - /*annotate_for_users*/ false); + if (!newShardOp) { + auto shardingOp = + builder.create(operandValue.getLoc(), sharding); + newShardOp = + builder.create(operandValue.getLoc(), operandValue, shardingOp, + /*annotate_for_users*/ false); + } IRRewriter rewriter(builder); rewriter.replaceUsesWithIf( operandValue, newShardOp, [operandOp, operandValue](OpOperand &use) { @@ -297,17 +310,19 @@ void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding, return; } - auto newShardOp2 = - builder.create(operandValue.getLoc(), newShardOp, shardingOp, - /*annotate_for_users*/ true); + auto newShardOp2 = builder.create(operandValue.getLoc(), newShardOp, + newShardOp.getSharding(), + /*annotate_for_users*/ true); rewriter.replaceAllUsesExcept(newShardOp, newShardOp2, newShardOp2); + return; } void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshSharding sharding, OpResult result, OpBuilder &builder) { + ShardOp newShardOp; for (auto &use : llvm::make_early_inc_range(result.getUses())) { - maybeInsertTargetShardingAnnotation(sharding, use, builder); + maybeInsertTargetShardingAnnotation(sharding, use, builder, newShardOp); } } @@ -316,9 +331,19 @@ void mlir::mesh::maybeInsertSourceShardingAnnotation(MeshSharding sharding, OpBuilder &builder) { OpBuilder::InsertionGuard insertionGuard(builder); Value operandValue = operand.get(); - Operation *operandOp = operand.getOwner(); Operation *operandSrcOp = operandValue.getDefiningOp(); bool isBlockArg = !operandSrcOp; + { + [[maybe_unused]] auto opType = + dyn_cast(operandValue.getType()); + assert(!opType || opType.getRank() > 0 || isFullReplication(sharding)); + } + if (!isa(operandValue.getType()) && operandSrcOp && + operandSrcOp->hasTrait()) { + return; + } + + Operation *operandOp = operand.getOwner(); ShardOp shardOp = dyn_cast_or_null(operandSrcOp); if (shardOp && sharding == shardOp.getSharding() && @@ -432,16 +457,14 @@ void ShardingOp::build(::mlir::OpBuilder &b, ::mlir::OperationState &odsState, ArrayRef split_axes, ArrayRef partial_axes, mesh::ReductionKind partial_type, - ArrayRef static_halo_sizes, - ArrayRef static_sharded_dims_offsets) { + ArrayRef static_halos, + ArrayRef static_offsets) { return build( b, odsState, mesh, MeshAxesArrayAttr::get(b.getContext(), split_axes), ::mlir::DenseI16ArrayAttr::get(b.getContext(), partial_axes), ::mlir::mesh::ReductionKindAttr::get(b.getContext(), partial_type), - ::mlir::DenseI64ArrayAttr::get(b.getContext(), static_halo_sizes), {}, - ::mlir::DenseI64ArrayAttr::get(b.getContext(), - static_sharded_dims_offsets), - {}); + ::mlir::DenseI64ArrayAttr::get(b.getContext(), static_halos), {}, + ::mlir::DenseI64ArrayAttr::get(b.getContext(), static_offsets), {}); } void ShardingOp::build(::mlir::OpBuilder &b, ::mlir::OperationState &odsState, @@ -453,6 +476,18 @@ void ShardingOp::build(::mlir::OpBuilder &b, ::mlir::OperationState &odsState, {}, {}, {}, {}); } +void ShardingOp::build(::mlir::OpBuilder &b, ::mlir::OperationState &odsState, + llvm::StringRef mesh, ArrayRef split_axes, + ArrayRef static_halos, + ArrayRef static_offsets) { + return build( + b, odsState, FlatSymbolRefAttr::get(b.getContext(), mesh), + MeshAxesArrayAttr::get(b.getContext(), split_axes), {}, + ::mlir::mesh::ReductionKindAttr::get(b.getContext(), ReductionKind::Sum), + 
::mlir::DenseI64ArrayAttr::get(b.getContext(), static_halos), {}, + ::mlir::DenseI64ArrayAttr::get(b.getContext(), static_offsets), {}); +} + void ShardingOp::build( ::mlir::OpBuilder &b, ::mlir::OperationState &odsState, FlatSymbolRefAttr mesh, ArrayRef split_axes, @@ -579,9 +614,10 @@ LogicalResult ShardingOp::verifySymbolUses(SymbolTableCollection &symbolTable) { namespace { // Sharding annotations "halo sizes" and "sharded dims offsets" // are a mix of attributes and dynamic values. This canonicalization moves -// constant values to the respective attribute lists and so minimizes the number +// constant values to the respective attribute lists, minimizing the number // of values. -class FoldDynamicLists final : public OpRewritePattern { +// It also removes sharded_dims_sizes and halos if they are effectively "empty". +class NormalizeSharding final : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -593,18 +629,48 @@ class FoldDynamicLists final : public OpRewritePattern { op.getDynamicShardedDimsOffsets(), b); // No constant operands were folded, just return; - if (failed(foldDynamicIndexList(mixedHalos, /*onlyNonNegative=*/true)) && - failed(foldDynamicIndexList(mixedOffs, /*onlyNonNegative=*/true))) { - return failure(); + bool modified = succeeded(foldDynamicIndexList(mixedHalos, true)) || + succeeded(foldDynamicIndexList(mixedOffs, true)); + + auto [staticHalos, dynamicHalos] = decomposeMixedValues(mixedHalos); + auto [staticOffs, dynamicOffs] = decomposeMixedValues(mixedOffs); + + if (dynamicHalos.empty() && !staticHalos.empty()) { + if (staticHalos[0] == 0 && llvm::all_equal(staticHalos)) { + staticHalos.clear(); + modified = true; + } + } + + // Remove sharded dims offsets if they are effectively the default values, + // e.g. if they define equi-distance between all neighboring shards. + // Requires static-only offsets. Compares the first distance as the + // difference between the first two offsets. Only if all consecutive + // distances are the same, the offsets are removed. 
+ if (dynamicOffs.empty() && !staticOffs.empty()) { + assert(staticOffs.size() >= 2); + auto diff = staticOffs[1] - staticOffs[0]; + bool all_same = staticOffs.size() > 2; + for (auto i = 2u; i < staticOffs.size(); ++i) { + if (staticOffs[i] - staticOffs[i - 1] != diff) { + all_same = false; + break; + } + } + if (all_same) { + staticOffs.clear(); + modified = true; + } } - auto halos = decomposeMixedValues(mixedHalos); - auto offs = decomposeMixedValues(mixedOffs); + if (!modified) { + return failure(); + } - op.setStaticHaloSizes(halos.first); - op.getDynamicHaloSizesMutable().assign(halos.second); - op.setStaticShardedDimsOffsets(offs.first); - op.getDynamicShardedDimsOffsetsMutable().assign(offs.second); + op.setStaticHaloSizes(staticHalos); + op.getDynamicHaloSizesMutable().assign(dynamicHalos); + op.setStaticShardedDimsOffsets(staticOffs); + op.getDynamicShardedDimsOffsetsMutable().assign(dynamicOffs); return success(); } @@ -613,7 +679,7 @@ class FoldDynamicLists final : public OpRewritePattern { void ShardingOp::getCanonicalizationPatterns(mlir::RewritePatternSet &results, mlir::MLIRContext *context) { - results.add(context); + results.add(context); } //===----------------------------------------------------------------------===// @@ -707,11 +773,19 @@ bool MeshSharding::operator!=(const MeshSharding &rhs) const { return !(*this == rhs); } +MeshSharding::MeshSharding(::mlir::FlatSymbolRefAttr mesh_) : mesh(mesh_) {} + MeshSharding::MeshSharding(Value rhs) { auto shardingOp = mlir::dyn_cast(rhs.getDefiningOp()); assert(shardingOp && "expected sharding op"); - *this = get(shardingOp.getMeshAttr(), shardingOp.getSplitAxes().getAxes(), - shardingOp.getPartialAxes().value_or(ArrayRef()), + auto splitAxes = shardingOp.getSplitAxes().getAxes(); + auto partialAxes = shardingOp.getPartialAxes().value_or(ArrayRef()); + // If splitAxes and partialAxes are empty, use "empty" constructor. + if (splitAxes.empty() && partialAxes.empty()) { + *this = MeshSharding(shardingOp.getMeshAttr()); + return; + } + *this = get(shardingOp.getMeshAttr(), splitAxes, partialAxes, shardingOp.getPartialType().value_or(ReductionKind::Sum), shardingOp.getStaticHaloSizes(), shardingOp.getStaticShardedDimsOffsets(), @@ -727,8 +801,11 @@ MeshSharding MeshSharding::get(::mlir::FlatSymbolRefAttr mesh_, ArrayRef static_sharded_dims_offsets_, ArrayRef dynamic_halo_sizes_, ArrayRef dynamic_sharded_dims_offsets_) { - MeshSharding res; - res.mesh = mesh_; + MeshSharding res(mesh_); + if (split_axes_.empty() && partial_axes_.empty()) { + return res; + } + res.split_axes.resize(split_axes_.size()); for (auto [i, axis] : llvm::enumerate(split_axes_)) { res.split_axes[i] = @@ -771,6 +848,53 @@ void ShardOp::getAsmResultNames( setNameFn(getResult(), "sharding_annotated"); } +namespace { +// Determine if the given ShardOp is a duplicate of another ShardOp +// on the same value. This can happen if constant values are sharded. +class FoldDuplicateShardOp final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ShardOp op, PatternRewriter &b) const override { + // Get the use-list of the value being sharded and check if it has more than + // one use. + Value value = op.getSrc(); + if (value.hasOneUse() || value.getDefiningOp()) { + return failure(); + } + + // Iterate through the uses of the value to find a duplicate ShardOp. 
+ for (auto &use : value.getUses()) { + if (use.getOwner() != op.getOperation()) { + auto otherOp = dyn_cast(use.getOwner()); + if (!otherOp || !otherOp->isBeforeInBlock(op)) { + return failure(); + } + // Create a MeshSharding object for the current and the other ShardOp + // If the two are equal replace current op with the other op. + MeshSharding currentSharding(op.getSharding()); + MeshSharding otherSharding(otherOp.getSharding()); + if (currentSharding == otherSharding) { + b.replaceAllUsesWith(op.getResult(), otherOp.getResult()); + b.eraseOp(op.getOperation()); + } else { + // use the other sharding as input for op + op.getSrcMutable().assign(otherOp.getResult()); + } + return success(); + } + } + + return failure(); + } +}; +} // namespace + +void ShardOp::getCanonicalizationPatterns(mlir::RewritePatternSet &results, + mlir::MLIRContext *context) { + results.add(context); +} + //===----------------------------------------------------------------------===// // mesh.process_multi_index op //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp index c1f4d563d5b42..f427d004c558f 100644 --- a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp +++ b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp @@ -168,17 +168,12 @@ LogicalResult mesh::ShardingInterface::verifyShardingInterfaceImpl() { // check operands and results type for (Type type : op->getOperandTypes()) - if (!llvm::isa(type)) + if (!llvm::isa(type) && !type.isIntOrIndexOrFloat()) return failure(); for (Type type : op->getResultTypes()) - if (!llvm::isa(type)) + if (!llvm::isa(type) && !type.isIntOrIndexOrFloat()) return failure(); - // check loop types - SmallVector loopTypes = getLoopIteratorTypes(); - if (loopTypes.empty()) - return failure(); - // check maps SmallVector maps = getIndexingMaps(); if (maps.empty()) @@ -286,18 +281,22 @@ mesh::detail::defaultGetShardingOption(Operation *op, continue; AffineMap map = maps[numOperands + shardingIt.index()]; anyShardingInResultsOrOperands = true; - // Handle the split axes: calculate the corresponding loop index for each - // split axes sub-array, and then store the sub-array to - // shardingOption[index] - for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) { - AffineExpr expr = std::get<0>(it); - ArrayRef axes = std::get<1>(it).asArrayRef(); - auto dim = cast(expr); - unsigned index = dim.getPosition(); - visitedLoopIndices.insert(index); - if (failed(fillShardingOption(op, shardingOption, shardAttr.getMeshAttr(), - axes, index))) - return failure(); + if (shardAttr.getSplitAxes().empty() || map.getResults().empty()) { + shardingOption.mesh = shardAttr.getMeshAttr(); + } else { + // Handle the split axes: calculate the corresponding loop index for each + // split axes sub-array, and then store the sub-array to + // shardingOption[index] + for (auto it : llvm::zip(map.getResults(), shardAttr.getSplitAxes())) { + AffineExpr expr = std::get<0>(it); + ArrayRef axes = std::get<1>(it).asArrayRef(); + auto dim = cast(expr); + unsigned index = dim.getPosition(); + visitedLoopIndices.insert(index); + if (failed(fillShardingOption(op, shardingOption, + shardAttr.getMeshAttr(), axes, index))) + return failure(); + } } // Handle the partial axes: at this stage, the exact loop index/indices @@ -323,7 +322,7 @@ mesh::detail::defaultGetShardingOption(Operation *op, if (!shardAttr) continue; - 
anyShardingInResultsOrOperands = true; + anyShardingInResultsOrOperands = !shardAttr.getSplitAxes().empty(); AffineMap map = maps[shardingIt.index()]; unsigned numDims = map.getNumDims(); @@ -448,7 +447,16 @@ static FailureOr getSharding(OpOperand &opOperand, const ShardingOption &shardingOption, AffineMap map) { Value operandValue = opOperand.get(); - auto operandType = cast(operandValue.getType()); + auto operandType = dyn_cast(operandValue.getType()); + if (!operandType) { + if (operandValue.getType().isIntOrIndexOrFloat()) + return MeshSharding(); + return failure(); + } + // 0d tensors cannot be sharded and must get replicated + if (operandType.getRank() == 0) { + return MeshSharding(shardingOption.mesh); + } SmallVector> splitAxes(operandType.getRank()); unsigned numDims = map.getNumDims(); for (auto it : llvm::enumerate(map.getResults())) { @@ -579,7 +587,7 @@ static bool isValueCompatibleWithFullReplicationSharding(Value value, MeshSharding sharding) { if (isa(value.getType())) { - return sharding && isFullReplication(sharding); + return isFullReplication(sharding); } return !sharding; diff --git a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp index 4bd3b425219c1..8c989cce63406 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp @@ -282,11 +282,12 @@ static FailureOr selectShardingOption( // a `mesh.shard` operation for all remaining operands and results that do not // have sharding annotations. static LogicalResult visitOp(Operation *op, OpBuilder &builder) { + ShardingInterface shardingOp = llvm::dyn_cast(op); if (op->hasTrait() || - llvm::isa(op)) + (op->hasTrait() && !shardingOp) || + llvm::isa(op)) return success(); - ShardingInterface shardingOp = llvm::dyn_cast(op); if (!shardingOp) { op->emitOpError() << "sharding interface is not implemented."; return failure(); diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index 327ea0991e4e1..601af0200e785 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -561,7 +561,8 @@ TypedValue reshard(ImplicitLocOpBuilder &builder, MeshOp mesh, TypedValue sourceUnshardedValue, TypedValue sourceShard) { // If source and destination sharding are the same, no need to do anything. 
- if (sourceSharding == targetSharding) { + if (sourceSharding == targetSharding || (isFullReplication(sourceSharding) && + isFullReplication(targetSharding))) { return sourceShard; } @@ -636,14 +637,6 @@ shardedBlockArgumentTypes(Block &block, return res; } -void spmdizeTriviallyShardableOperation(Operation &op, - ArrayRef spmdizedOperands, - ArrayRef operandShardings, - ArrayRef resultShardings, - IRMapping &spmdizationMap, - SymbolTableCollection &symbolTable, - OpBuilder &builder); - static LogicalResult spmdizeOperation( Operation &op, ArrayRef spmdizedOperands, ArrayRef operandShardings, @@ -703,8 +696,9 @@ static std::vector getResultShardings(Operation &op) { if (!rankedTensor) { return MeshSharding(); } - - assert(result.hasOneUse()); + if (!result.hasOneUse()) { + return MeshSharding(); + } Operation *userOp = *result.getUsers().begin(); ShardOp shardOp = llvm::cast(userOp); return MeshSharding(shardOp.getSharding()); @@ -744,6 +738,15 @@ spmdizeOperation(Operation &op, IRMapping &spmdizationMap, if (isa(op)) { return success(); } + if (auto getShardingOp = dyn_cast(op)) { + auto shardOp = getShardingOp.getSource().getDefiningOp(); + if (!shardOp) { + return op.emitError("expected a shard op as source of get_sharding"); + } + auto newSharding = builder.clone(*shardOp.getSharding().getDefiningOp()); + spmdizationMap.map(op.getResult(0), newSharding->getResult(0)); + return success(); + } ShardOp shardOp = llvm::dyn_cast(op); if (shardOp) { @@ -765,6 +768,7 @@ spmdizeOperation(Operation &op, IRMapping &spmdizationMap, static LogicalResult spmdizeBlock(Block &block, IRMapping &spmdizationMap, SymbolTableCollection &symbolTableCollection, OpBuilder &builder) { + SmallVector argLocations; llvm::transform(block.getArguments(), std::back_inserter(argLocations), [](BlockArgument arg) { return arg.getLoc(); }); @@ -796,8 +800,12 @@ spmdizeFuncOp(FunctionOpInterface op, IRMapping &spmdizationMap, // Snapshot the original blocks to not mess up the iteration when adding new // blocks. 
SmallVector originalBlocks; - llvm::transform(op.getBlocks(), std::back_inserter(originalBlocks), - [](Block &b) { return &b; }); + for (Block &b : op.getBlocks()) { + if (llvm::any_of(b.getOperations(), + [](Operation &op) { return isa(op); })) { + originalBlocks.push_back(&b); + } + } for (Block *block : originalBlocks) { if (failed(spmdizeBlock(*block, spmdizationMap, symbolTableCollection, @@ -823,10 +831,11 @@ spmdizeFuncOp(FunctionOpInterface op, IRMapping &spmdizationMap, break; } } - assert(returnOp); - op.setType(FunctionType::get(op->getContext(), - op.getFunctionBody().front().getArgumentTypes(), - returnOp->getOperandTypes())); + if (returnOp) { + op.setType(FunctionType::get( + op->getContext(), op.getFunctionBody().front().getArgumentTypes(), + returnOp->getOperandTypes())); + } return success(); } diff --git a/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp index f3e72abe7516e..b2acbf20b3fb9 100644 --- a/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp +++ b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp @@ -22,10 +22,11 @@ using namespace mlir::mesh; namespace { -// Sharding of tensor.empty -struct EmptyOpShardingInterface - : public ShardingInterface::ExternalModel { +// Sharding of tensor.empty/tensor.splat +template +struct CreatorOpShardingInterface + : public ShardingInterface::ExternalModel, + OpTy> { SmallVector getLoopIteratorTypes(Operation *op) const { auto ndims = mlir::cast(op->getResult(0).getType()).getRank(); return SmallVector(ndims, @@ -38,7 +39,9 @@ struct EmptyOpShardingInterface auto type = dyn_cast(val.getType()); if (!type) return {}; - return {AffineMap::getMultiDimIdentityMap(type.getRank(), ctx)}; + return SmallVector( + op->getNumOperands() + op->getNumResults(), + {AffineMap::getMultiDimIdentityMap(type.getRank(), ctx)}); } LogicalResult spmdize(Operation *op, ArrayRef spmdizedOperands, @@ -82,8 +85,7 @@ struct EmptyOpShardingInterface newOperands.emplace_back(spmdizedOperands[++currOldOprndNum]); } } - newOp = - builder.create(op->getLoc(), shardType, newOperands); + newOp = builder.create(op->getLoc(), shardType, newOperands); spmdizationMap.map(op->getResult(0), newOp->getResult(0)); } else { // `clone` will populate the mapping of old to new results. 
@@ -100,6 +102,9 @@ void mlir::tensor::registerShardingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { - EmptyOp::template attachInterface(*ctx); + EmptyOp::template attachInterface>( + *ctx); + SplatOp::template attachInterface>( + *ctx); }); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index fda6246334e15..03c2f3843f262 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -354,6 +354,35 @@ bool mlir::tensor::canFoldIntoProducerOp(CastOp castOp) { castOp.getType()); } +bool mlir::tensor::hasFoldableTensorCastOperand(Operation *op) { + return llvm::any_of(op->getOpOperands(), [&](OpOperand &opOperand) { + if (llvm::isa(opOperand.get())) + return false; + auto castOp = opOperand.get().getDefiningOp(); + return castOp && canFoldIntoConsumerOp(castOp); + }); +} + +SmallVector mlir::tensor::getUpdatedOperandsAfterCastOpFolding( + DestinationStyleOpInterface op, SmallVector &newResTy) { + SmallVector newOperands; + newOperands.reserve(op->getNumOperands()); + + assert(hasFoldableTensorCastOperand(op) && "No foldable CastOp operands!"); + + // Assumes that the result has dpsInits followed by nonDpsInits. + int64_t dpsInitIdx = 0; + for (OpOperand &opOperand : op->getOpOperands()) { + auto tensorCastOp = opOperand.get().getDefiningOp(); + bool fold = canFoldIntoConsumerOp(tensorCastOp); + newOperands.push_back(fold ? tensorCastOp.getOperand() : opOperand.get()); + if (op.isDpsInit(&opOperand) && + !llvm::isa(newOperands.back().getType())) + newResTy[dpsInitIdx++] = newOperands.back().getType(); + } + return newOperands; +} + /// Performs folding of any operand of `op` if it comes from a tensor::CastOp /// that can be folded. LogicalResult mlir::tensor::foldTensorCast(Operation *op) { @@ -4777,34 +4806,7 @@ bool foldTensorCastPrecondition(DestinationStyleOpInterface op) { isa(op.getOperation())) return false; - // If no operand comes from a tensor::CastOp and can be folded then fail. - bool hasTensorCastOperand = - llvm::any_of(op->getOpOperands(), [&](OpOperand &opOperand) { - if (llvm::isa(opOperand.get())) - return false; - auto castOp = opOperand.get().getDefiningOp(); - return castOp && canFoldIntoConsumerOp(castOp); - }); - - return hasTensorCastOperand; -} - -static SmallVector getNewOperands(DestinationStyleOpInterface op, - SmallVector &newResTy) { - SmallVector newOperands; - newOperands.reserve(op->getNumOperands()); - - // Assumes that the result has dpsInits followed by nonDpsInits. - int64_t dpsInitIdx = 0; - for (OpOperand &opOperand : op->getOpOperands()) { - auto tensorCastOp = opOperand.get().getDefiningOp(); - bool fold = canFoldIntoConsumerOp(tensorCastOp); - newOperands.push_back(fold ? tensorCastOp.getOperand() : opOperand.get()); - if (op.isDpsInit(&opOperand) && - !llvm::isa(newOperands.back().getType())) - newResTy[dpsInitIdx++] = newOperands.back().getType(); - } - return newOperands; + return hasFoldableTensorCastOperand(op); } // Given the (potentially) updated packed type, `newPackedTy`, generates an @@ -4868,7 +4870,8 @@ struct FoldTensorCastPackOp : public OpRewritePattern { return failure(); SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = getNewOperands(op, newResultTypes); + SmallVector newOperands = + getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); // Get the updated mixed-tile-sizes attribute. 
SmallVector newMixedTileSizes = @@ -4920,7 +4923,8 @@ struct FoldTensorCastUnPackOp : public OpRewritePattern { return failure(); SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = getNewOperands(op, newResultTypes); + SmallVector newOperands = + getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); Value sourceTensor = newOperands[0]; // Get the updated mixed-tile-sizes attribute. @@ -4980,7 +4984,8 @@ struct FoldTensorCastProducerOp return failure(); SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = getNewOperands(op, newResultTypes); + SmallVector newOperands = + getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); // Clone op auto newOp = clone(rewriter, op, newResultTypes, newOperands); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index a9a65ac271b3c..69b3f6d674167 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -287,10 +287,12 @@ struct ClampIsNoOp : public OpRewritePattern { if (isa(inputElementType)) { // Unlike integer types, floating point types can represent infinity. - auto minClamp = op.getMinFp(); - auto maxClamp = op.getMaxFp(); - bool isMin = minClamp.isInfinity() && minClamp.isNegative(); - bool isMax = maxClamp.isInfinity() && !maxClamp.isNegative(); + auto minClamp = + llvm::cast(op.getMinValAttr()).getValue(); + auto maxClamp = + llvm::cast(op.getMaxValAttr()).getValue(); + bool isMin = minClamp.isNegInfinity(); + bool isMax = maxClamp.isInfinity(); if (isMin && isMax) { rewriter.replaceOp(op, input); @@ -300,8 +302,10 @@ struct ClampIsNoOp : public OpRewritePattern { } if (inputElementType.isUnsignedInteger()) { - int64_t minClamp = op.getMinInt(); - int64_t maxClamp = op.getMaxInt(); + int64_t minClamp = + llvm::cast(op.getMinValAttr()).getUInt(); + int64_t maxClamp = + llvm::cast(op.getMaxValAttr()).getUInt(); int64_t intMin = APInt::getMinValue(inputElementType.getIntOrFloatBitWidth()) @@ -318,8 +322,10 @@ struct ClampIsNoOp : public OpRewritePattern { } if (llvm::isa(inputElementType)) { - int64_t minClamp = op.getMinInt(); - int64_t maxClamp = op.getMaxInt(); + int64_t minClamp = + llvm::cast(op.getMinValAttr()).getInt(); + int64_t maxClamp = + llvm::cast(op.getMaxValAttr()).getInt(); int64_t intMin = APInt::getSignedMinValue(inputElementType.getIntOrFloatBitWidth()) @@ -374,9 +380,10 @@ struct ClampClampOptimization : public OpRewritePattern { LogicalResult matchAndRewrite(tosa::ClampOp op, PatternRewriter &rewriter) const override { + Value input = op.getInput(); + // Check the input to the CLAMP op is itself a CLAMP. - auto clampOp = - dyn_cast_if_present(op.getInput().getDefiningOp()); + auto clampOp = dyn_cast_if_present(input.getDefiningOp()); if (!clampOp) return failure(); @@ -386,34 +393,86 @@ struct ClampClampOptimization : public OpRewritePattern { if (opNanMode == "IGNORE" && clampNanMode == "PROPAGATE") return failure(); - // Check we have intersecting ranges. 
- const auto opMinInt = op.getMinInt(); - const auto opMaxInt = op.getMaxInt(); - const auto clampOpMinInt = clampOp.getMinInt(); - const auto clampOpMaxInt = clampOp.getMaxInt(); - ClampRange opRangeIntRange(opMinInt, opMaxInt); - ClampRange clampRangeIntRange(clampOpMinInt, clampOpMaxInt); - if (!opRangeIntRange.intersects(clampRangeIntRange)) - return failure(); + auto maxValAttr = op.getMaxValAttr(); + auto minValAttr = op.getMinValAttr(); + auto clampOpMaxValAttr = clampOp.getMaxValAttr(); + auto clampOpMinValAttr = clampOp.getMinValAttr(); - const auto opMinFloat = op.getMinFp(); - const auto opMaxFloat = op.getMaxFp(); - const auto clampOpMinFloat = clampOp.getMinFp(); - const auto clampOpMaxFloat = clampOp.getMaxFp(); - ClampRange opRangeFloatRange(opMinFloat, opMaxFloat); - ClampRange clampRangeFloatRange(clampOpMinFloat, clampOpMaxFloat); - if (!opRangeFloatRange.intersects(clampRangeFloatRange)) - return failure(); + auto inputEType = llvm::cast(input.getType()).getElementType(); + if (auto quantType = + llvm::dyn_cast(inputEType)) { + inputEType = quantType.getStorageType(); + } + + Attribute newMinValAttr, newMaxValAttr; + if (mlir::isa(inputEType)) { + auto floatMaxValAttr = cast(maxValAttr); + auto floatMinValAttr = cast(minValAttr); + auto clampOpFloatMaxValAttr = cast(clampOpMaxValAttr); + auto clampOpFloatMinValAttr = cast(clampOpMinValAttr); + + // Check we have intersecting ranges. + const auto opMinFloat = floatMinValAttr.getValue(); + const auto opMaxFloat = floatMaxValAttr.getValue(); + const auto clampOpMinFloat = clampOpFloatMinValAttr.getValue(); + const auto clampOpMaxFloat = clampOpFloatMaxValAttr.getValue(); + ClampRange opRangeFloatRange(opMinFloat, opMaxFloat); + ClampRange clampRangeFloatRange(clampOpMinFloat, + clampOpMaxFloat); + if (!opRangeFloatRange.intersects(clampRangeFloatRange)) + return failure(); + + // Run the transformation. + auto newMinVal = std::max(opMinFloat, clampOpMinFloat); + auto newMaxVal = std::min(opMaxFloat, clampOpMaxFloat); + newMinValAttr = rewriter.getFloatAttr(inputEType, newMinVal); + newMaxValAttr = rewriter.getFloatAttr(inputEType, newMaxVal); + } else { + assert(mlir::isa(inputEType)); + auto intMaxValAttr = cast(maxValAttr); + auto intMinValAttr = cast(minValAttr); + auto clampOpIntMaxValAttr = cast(clampOpMaxValAttr); + auto clampOpIntMinValAttr = cast(clampOpMinValAttr); + + if (inputEType.isUnsignedInteger()) { + // Check we have intersecting ranges. + const auto opMinInt = intMinValAttr.getUInt(); + const auto opMaxInt = intMaxValAttr.getUInt(); + const auto clampOpMinInt = clampOpIntMinValAttr.getUInt(); + const auto clampOpMaxInt = clampOpIntMaxValAttr.getUInt(); + ClampRange opRangeIntRange(opMinInt, opMaxInt); + ClampRange clampRangeIntRange(clampOpMinInt, + clampOpMaxInt); + if (!opRangeIntRange.intersects(clampRangeIntRange)) + return failure(); + + // Run the transformation. + auto newMinVal = std::max(opMinInt, clampOpMinInt); + auto newMaxVal = std::min(opMaxInt, clampOpMaxInt); + newMinValAttr = rewriter.getIntegerAttr(inputEType, newMinVal); + newMaxValAttr = rewriter.getIntegerAttr(inputEType, newMaxVal); + } else { + // Check we have intersecting ranges. 
+ const auto opMinInt = intMinValAttr.getInt(); + const auto opMaxInt = intMaxValAttr.getInt(); + const auto clampOpMinInt = clampOpIntMinValAttr.getInt(); + const auto clampOpMaxInt = clampOpIntMaxValAttr.getInt(); + ClampRange opRangeIntRange(opMinInt, opMaxInt); + ClampRange clampRangeIntRange(clampOpMinInt, + clampOpMaxInt); + if (!opRangeIntRange.intersects(clampRangeIntRange)) + return failure(); + + // Run the transformation. + auto newMinVal = std::max(opMinInt, clampOpMinInt); + auto newMaxVal = std::min(opMaxInt, clampOpMaxInt); + newMinValAttr = rewriter.getIntegerAttr(inputEType, newMinVal); + newMaxValAttr = rewriter.getIntegerAttr(inputEType, newMaxVal); + } + } - // Run the transformation. - const auto minFp = std::max(opMinFloat, clampOpMinFloat).convertToFloat(); - const auto maxFp = std::min(opMaxFloat, clampOpMaxFloat).convertToFloat(); - const auto minInt = std::max(opMinInt, clampOpMinInt); - const auto maxInt = std::min(opMaxInt, clampOpMaxInt); rewriter.replaceOpWithNewOp( - op, op.getType(), clampOp.getInput(), - rewriter.getI64IntegerAttr(minInt), rewriter.getI64IntegerAttr(maxInt), - rewriter.getF32FloatAttr(minFp), rewriter.getF32FloatAttr(maxFp), + op, op.getType(), clampOp.getInput(), newMinValAttr, newMaxValAttr, rewriter.getStringAttr((opNanMode != clampNanMode) ? "IGNORE" : opNanMode)); return success(); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index af4a5dc96265e..e782838ad97f9 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -476,26 +476,40 @@ LogicalResult tosa::ClampOp::verify() { llvm::dyn_cast(inputETy)) { inputETy = quantType.getStorageType(); } - mlir::Type maxFpType = getMaxFpAttr().getType(); - mlir::Type minFpType = getMinFpAttr().getType(); mlir::Type outputETy = llvm::cast(getOutput().getType()).getElementType(); if (auto quantType = llvm::dyn_cast(outputETy)) { outputETy = quantType.getStorageType(); } - unsigned dataTypeBitWidth = inputETy.getIntOrFloatBitWidth(); - if (inputETy != outputETy) return emitOpError("input/output element types are incompatible."); - // If input datatype is float, check that the two min/max_fp attributes - // share the same type and that their type is either the same of the input's - // datatype, or a float type whose bitwidth > input datatype bitwidth. 
- if (!inputETy.isInteger(dataTypeBitWidth)) { - if (((maxFpType != minFpType) || - (maxFpType != inputETy && maxFpType.getIntOrFloatBitWidth() <= - inputETy.getIntOrFloatBitWidth()))) + auto maxValAttr = getMaxValAttr(); + auto minValAttr = getMinValAttr(); + + unsigned dataTypeBitWidth = inputETy.getIntOrFloatBitWidth(); + + if (inputETy.isInteger(dataTypeBitWidth)) { + // if input datatype is integer, check that the min_val/max_val attributes + // are integer attributes, and that their type is the same as the input's + // datatype + auto intMaxValAttr = mlir::dyn_cast(maxValAttr); + auto intMinValAttr = mlir::dyn_cast(minValAttr); + if (!intMaxValAttr || !intMinValAttr || + (intMaxValAttr.getType() != intMinValAttr.getType()) || + (intMaxValAttr.getType() != inputETy)) + return emitOpError("min/max attributes types are incompatible with " + "input/output element types."); + } else { + // otherwise, input datatype is float, check that the min_val/max_val + // attributes share the same type and that their type is the same as the + // input's datatype + auto floatMaxValAttr = mlir::dyn_cast(maxValAttr); + auto floatMinValAttr = mlir::dyn_cast(minValAttr); + if (!floatMaxValAttr || !floatMinValAttr || + (floatMaxValAttr.getType() != floatMinValAttr.getType()) || + (floatMaxValAttr.getType() != inputETy)) return emitOpError("min/max attributes types are incompatible with " "input/output element types."); } @@ -1116,16 +1130,10 @@ LogicalResult tosa::MulOp::inferReturnTypeComponents( ValueShapeRange operands, DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions, SmallVectorImpl &inferredReturnShapes) { - LogicalResult status = success(); + // mul op's output shape only depend on input1 and input2, not on shift + ValueShapeRange twoInputs = operands.drop_back(); llvm::SmallVector outShape; - if (operands.size() == 2) { - status = resolveBroadcastShape(operands, outShape); - } else { - // mul op's output shape only depend on input1 and input2, not on shift - ValueShapeRange two_inputs = operands.drop_back(); - status = resolveBroadcastShape(two_inputs, outShape); - } - if (status.failed()) { + if (resolveBroadcastShape(twoInputs, outShape).failed()) { inferredReturnShapes.push_back(ShapedTypeComponents()); } else { inferredReturnShapes.push_back(ShapedTypeComponents(outShape)); @@ -1160,6 +1168,15 @@ LogicalResult tosa::MulOp::verify() { return emitOpError( "requires the same element type for all operands and results"); } + + // verify shift has value 0 for non-integer types + ElementsAttr shift_elem; + if (matchPattern(getShift(), m_Constant(&shift_elem))) { + int32_t shift = shift_elem.getValues()[0].getInt(); + if (shift != 0) { + return emitOpError() << "require shift to be 0 for float type"; + } + } } // Verify the op has same ranks for all main operands (excludes extra operands @@ -2601,6 +2618,10 @@ OpTrait::tosa::verifyTosaShapeOperatorWithSameRanks(Operation *op) { //===----------------------------------------------------------------------===// LogicalResult tosa::ConstShapeOp::verify() { + // check one dimensional rank + auto valuesRank = getValue().getType().getRank(); + if (valuesRank != 1) + return emitOpError("expect elements in attribute value with rank 1"); // check that number of elements in value attr equal to rank of result shape auto count = getValue().getNumElements(); auto rank = (cast(getResult().getType())).getRank(); diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp 
index 281f0529a5c08..64e5c31793f84 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp @@ -287,8 +287,7 @@ bool TosaReduceTransposes::collectFanIn(Operation *op, for (Value operand : op->getOperands()) { // If this is a problem in future, think about alternatives to recursion. - if (llvm::isa(op) && op->getNumOperands() == 3 && - operand == op->getOperand(2)) { + if (llvm::isa(op) && operand == op->getOperand(2)) { // do not recurse into MulOp's shift operand continue; } @@ -332,8 +331,7 @@ std::optional TosaReduceTransposes::buildMappedToValue( for (Value v : op->getOperands()) { if (valuesMap.contains(v)) { operands.push_back(valuesMap.at(v)); - } else if (llvm::isa(op) && op->getNumOperands() == 3 && - v == op->getOperand(2)) { + } else if (llvm::isa(op) && v == op->getOperand(2)) { // special case for MulOp's shift operand operands.push_back(v); } else { diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 94f9ead9e1665..d5f3634377e4c 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2395,7 +2395,7 @@ computeBroadcastedUnitDims(ArrayRef srcShape, for (auto [s1, s2] : llvm::zip_equal(srcShape, dstShape.drop_front(rankDiff))) { if (s1 != s2) { - assert(s1 == 1 && "expected dim-1 broadcasting"); + assert(s1 == 1 && "expected \"dim-1\" broadcasting"); res.insert(dstDim); } ++dstDim; @@ -2414,7 +2414,7 @@ llvm::SetVector BroadcastOp::computeBroadcastedUnitDims() { /// Broadcast `value` to a vector of `dstShape`, knowing that exactly the /// `broadcastedDims` dimensions in the dstShape are broadcasted. -/// This requires (and asserts) that the broadcast is free of dim-1 +/// This requires (and asserts) that the broadcast is free of "dim-1" /// broadcasting. /// Since vector.broadcast only allows expanding leading dimensions, an extra /// vector.transpose may be inserted to make the broadcast possible. @@ -2500,10 +2500,10 @@ Value BroadcastOp::createOrFoldBroadcastOp( // 3.c. Append the srcShape. llvm::append_range(broadcastShape, srcVectorType.getShape()); - // Ensure there are no dim-1 broadcasts. + // Ensure there are no "dim-1" broadcasts. assert(::computeBroadcastedUnitDims(srcVectorType.getShape(), broadcastShape) .empty() && - "unexpected dim-1 broadcast"); + "unexpected \"dim-1\" broadcast"); VectorType broadcastType = VectorType::get(broadcastShape, elementType); assert(vector::isBroadcastableTo(value.getType(), broadcastType) == @@ -4023,8 +4023,8 @@ class ContiguousExtractStridedSliceToExtract final // Avoid generating slices that have leading unit dimensions. The shape_cast // op that we create below would take bad generic fallback patterns // (ShapeCastOpRewritePattern). 
- while (sizes[numOffsets] == 1 && - numOffsets < static_cast(sizes.size()) - 1) { + while (numOffsets < static_cast(sizes.size()) - 1 && + sizes[numOffsets] == 1) { ++numOffsets; } diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index eea4f7fa5c4be..0fa97f1f38079 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -182,7 +182,7 @@ struct AsmPrinterOptions { llvm::cl::opt printLocalScopeOpt{ "mlir-print-local-scope", llvm::cl::init(false), llvm::cl::desc("Print with local scope and inline information (eliding " - "aliases for attributes, types, and locations")}; + "aliases for attributes, types, and locations)")}; llvm::cl::opt skipRegionsOpt{ "mlir-print-skip-regions", llvm::cl::init(false), diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt index 93032c3ce1038..ccb4cfcb7ae40 100644 --- a/mlir/lib/Target/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_translation_library(MLIRTargetLLVMIRExport intrinsics_gen LINK_COMPONENTS + Analysis Core FrontendOpenMP TransformUtils diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 659ab1227f113..8b13735774663 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -17,6 +17,7 @@ #include "mlir/IR/Operation.h" #include "mlir/Target/LLVMIR/ModuleTranslation.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsNVPTX.h" @@ -227,14 +228,14 @@ class NVVMDialectLLVMIRTranslationInterface } else if (attribute.getName() == NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) { auto value = dyn_cast(attribute.getValue()); - generateMetadata(value.getInt(), "cluster_max_blocks"); + llvmFunc->addFnAttr("nvvm.maxclusterrank", llvm::utostr(value.getInt())); } else if (attribute.getName() == NVVM::NVVMDialect::getMinctasmAttrName()) { auto value = dyn_cast(attribute.getValue()); - generateMetadata(value.getInt(), "minctasm"); + llvmFunc->addFnAttr("nvvm.minctasm", llvm::utostr(value.getInt())); } else if (attribute.getName() == NVVM::NVVMDialect::getMaxnregAttrName()) { auto value = dyn_cast(attribute.getValue()); - generateMetadata(value.getInt(), "maxnreg"); + llvmFunc->addFnAttr("nvvm.maxnreg", llvm::utostr(value.getInt())); } else if (attribute.getName() == NVVM::NVVMDialect::getKernelFuncAttrName()) { llvmFunc->setCallingConv(llvm::CallingConv::PTX_Kernel); diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 3da47de6ac24b..5cd841ee2df91 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -38,6 +38,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Analysis/TargetFolder.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -144,7 +145,7 @@ class InstructionCapturingInserter : public llvm::IRBuilderCallbackInserter { }; using CapturingIRBuilder = - llvm::IRBuilder; + llvm::IRBuilder; } // namespace InstructionCapturingInserter::CollectionScope::CollectionScope( @@ -1171,7 +1172,9 @@ LogicalResult ModuleTranslation::convertGlobalsAndAliases() { // Convert global variable bodies. 
for (auto op : getModuleBody(mlirModule).getOps()) { if (Block *initializer = op.getInitializerBlock()) { - llvm::IRBuilder<> builder(llvmModule->getContext()); + llvm::IRBuilder builder( + llvmModule->getContext(), + llvm::TargetFolder(llvmModule->getDataLayout())); [[maybe_unused]] int numConstantsHit = 0; [[maybe_unused]] int numConstantsErased = 0; @@ -1282,7 +1285,9 @@ LogicalResult ModuleTranslation::convertGlobalsAndAliases() { // Convert global alias bodies. for (auto op : getModuleBody(mlirModule).getOps()) { Block &initializer = op.getInitializerBlock(); - llvm::IRBuilder<> builder(llvmModule->getContext()); + llvm::IRBuilder builder( + llvmModule->getContext(), + llvm::TargetFolder(llvmModule->getDataLayout())); for (mlir::Operation &op : initializer.without_terminator()) { if (failed(convertOperation(op, builder))) @@ -1517,7 +1522,8 @@ LogicalResult ModuleTranslation::convertOneFunction(LLVMFuncOp func) { // converted before uses. auto blocks = getBlocksSortedByDominance(func.getBody()); for (Block *bb : blocks) { - CapturingIRBuilder builder(llvmContext); + CapturingIRBuilder builder(llvmContext, + llvm::TargetFolder(llvmModule->getDataLayout())); if (failed(convertBlockImpl(*bb, bb->isEntryBlock(), builder, /*recordInsertions=*/true))) return failure(); @@ -2136,7 +2142,9 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, LLVM::legalizeDIExpressionsRecursively(module); ModuleTranslation translator(module, std::move(llvmModule)); - llvm::IRBuilder<> llvmBuilder(llvmContext); + llvm::IRBuilder llvmBuilder( + llvmContext, + llvm::TargetFolder(translator.getLLVMModule()->getDataLayout())); // Convert module before functions and operations inside, so dialect // attributes can be used to change dialect-specific global configurations via diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index fb7efb8cd28a5..ab975a6954044 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -46,6 +46,7 @@ import abc import collections from collections.abc import Callable, Sequence import io +from pathlib import Path from typing import Any, ClassVar, TypeVar, overload __all__ = [ @@ -2129,6 +2130,15 @@ class Module: Returns a new MlirModule or raises an MLIRError if the parsing fails. + See also: https://mlir.llvm.org/docs/LangRef/ + """ + @staticmethod + def parseFile(path: str, context: Context | None = None) -> Module: + """ + Parses a module's assembly format from file. + + Returns a new MlirModule or raises an MLIRError if the parsing fails. + See also: https://mlir.llvm.org/docs/LangRef/ """ def _CAPICreate(self) -> Any: ... 
diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index 1dabacfd8a47c..7daf4ef8717bc 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -577,12 +577,26 @@ func.func @cmpi_2dvector(%arg0 : vector<4x3xi32>, %arg1 : vector<4x3xi32>) { // ----- // CHECK-LABEL: @select +// CHECK-SAME: (%[[ARG0:.*]]: i1, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32) func.func @select(%arg0 : i1, %arg1 : i32, %arg2 : i32) -> i32 { - // CHECK: = llvm.select %arg0, %arg1, %arg2 : i1, i32 + // CHECK: %[[RES:.*]] = llvm.select %[[ARG0]], %[[ARG1]], %[[ARG2]] : i1, i32 + // CHECK: return %[[RES]] %0 = arith.select %arg0, %arg1, %arg2 : i32 return %0 : i32 } +// CHECK-LABEL: @select_complex +// CHECK-SAME: (%[[ARG0:.*]]: i1, %[[ARG1:.*]]: complex, %[[ARG2:.*]]: complex) +func.func @select_complex(%arg0 : i1, %arg1 : complex, %arg2 : complex) -> complex { + // CHECK-DAG: %[[ARGC1:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : complex to !llvm.struct<(f32, f32)> + // CHECK-DAG: %[[ARGC2:.*]] = builtin.unrealized_conversion_cast %[[ARG2]] : complex to !llvm.struct<(f32, f32)> + // CHECK: %[[RES:.*]] = llvm.select %[[ARG0]], %[[ARGC1]], %[[ARGC2]] : i1, !llvm.struct<(f32, f32)> + // CHECK: %[[RESC:.*]] = builtin.unrealized_conversion_cast %[[RES]] : !llvm.struct<(f32, f32)> to complex + // CHECK: return %[[RESC]] + %0 = arith.select %arg0, %arg1, %arg2 : complex + return %0 : complex +} + // ----- // CHECK-LABEL: @ceildivsi @@ -727,3 +741,15 @@ func.func @ops_supporting_overflow(%arg0: i64, %arg1: i64) { %3 = arith.shli %arg0, %arg1 overflow : i64 return } + +// ----- + +// CHECK-LABEL: func @memref_bitcast +// CHECK-SAME: (%[[ARG:.*]]: memref) +// CHECK: %[[V1:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// CHECK: %[[V2:.*]] = builtin.unrealized_conversion_cast %[[V1]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref +// CHECK: return %[[V2]] +func.func @memref_bitcast(%1: memref) -> memref { + %2 = arith.bitcast %1 : memref to memref + func.return %2 : memref +} diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index d8ba28a3ce887..17add2d41afe7 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -472,7 +472,8 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () { // CHECK: linalg.generic // CHECK: arith.mulf - %4 = tosa.mul %0, %1 : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %4 = tosa.mul %0, %1, %shift : (tensor<1xf32>, tensor<1xf32>, tensor<1xi8>) -> tensor<1xf32> // CHECK: linalg.generic // CHECK: arith.negf @@ -529,7 +530,7 @@ func.func @test_simple_f32(%arg0: tensor<1xf32>) -> () { // CHECK: linalg.generic // CHECK: arith.minimumf // CHECK: arith.maximumf - %18 = tosa.clamp %0 {min_int = 1 : i64, max_int = 5 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xf32>) -> tensor<1xf32> + %18 = tosa.clamp %0 {min_val = 1.0 : f32, max_val = 5.0 : f32} : (tensor<1xf32>) -> tensor<1xf32> // CHECK: linalg.generic // CHECK: arith.negf @@ -618,7 +619,8 @@ func.func @test_simple_i16(%arg0: tensor<1xi16>) -> () { // CHECK: arith.extsi // CHECK: arith.extsi // CHECK: arith.muli - %0 = tosa.mul %arg0, %arg0 : (tensor<1xi16>, 
tensor<1xi16>) -> tensor<1xi32> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %0 = tosa.mul %arg0, %arg0, %shift : (tensor<1xi16>, tensor<1xi16>, tensor<1xi8>) -> tensor<1xi32> return } @@ -729,35 +731,14 @@ func.func @test_simple_i32(%arg0: tensor<1xi32>, %unsigned: tensor<1xui32>, %uns // CHECK: linalg.generic // CHECK-DAG: arith.maxsi // CHECK-DAG: arith.minsi - %19 = tosa.clamp %0 {min_int = 1 : i64, max_int = 5 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xi32>) -> tensor<1xi32> + %19 = tosa.clamp %0 {min_val = 1 : i32, max_val = 5 : i32} : (tensor<1xi32>) -> tensor<1xi32> // CHECK: linalg.generic // CHECK-DAG: %[[LB:.*]] = arith.constant 4 : i32 // CHECK-DAG: %[[UB:.*]] = arith.constant 32 : i32 // CHECK-DAG: arith.maxui %[[LB]], // CHECK-DAG: arith.minui %[[UB]], - %u0 = tosa.clamp %unsigned {min_int = 4 : i64, max_int = 32 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xui32>) -> tensor<1xui32> - - // CHECK: linalg.generic - // CHECK-DAG: %[[LB:.*]] = arith.constant -1 : i32 - // CHECK-DAG: %[[UB:.*]] = arith.constant -1 : i32 - // CHECK-DAG: arith.maxui %[[LB]], - // CHECK-DAG: arith.minui %[[UB]], - %u1 = tosa.clamp %unsigned {min_int = 9223372036854775807 : i64, max_int = 9223372036854775807 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xui32>) -> tensor<1xui32> - - // CHECK: linalg.generic - // CHECK-DAG: %[[LB:.*]] = arith.constant 0 : i32 - // CHECK-DAG: %[[UB:.*]] = arith.constant 0 : i32 - // CHECK-DAG: arith.maxui %[[LB]], - // CHECK-DAG: arith.minui %[[UB]], - %u2 = tosa.clamp %unsigned {min_int = -3 : i64, max_int = -2 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xui32>) -> tensor<1xui32> - - // CHECK: linalg.generic - // CHECK-DAG: %[[LB:.*]] = arith.constant 0 : i64 - // CHECK-DAG: %[[UB:.*]] = arith.constant 9223372036854775807 : i64 - // CHECK-DAG: arith.maxui %[[LB]], - // CHECK-DAG: arith.minui %[[UB]], - %u3 = tosa.clamp %unsigned64 {min_int = -3 : i64, max_int = 9223372036854775807 : i64, min_fp = 1.0 : f32, max_fp = 5.0 : f32} : (tensor<1xui64>) -> tensor<1xui64> + %u0 = tosa.clamp %unsigned {min_val = 4 : ui32, max_val = 32 : ui32} : (tensor<1xui32>) -> tensor<1xui32> // CHECK: linalg.generic // CHECK: arith.trunci @@ -807,15 +788,7 @@ func.func @test_i8(%arg0: tensor<1xi8>) -> () { // CHECK-DAG: %[[C126:.+]] = arith.constant 126 // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C127]], %[[ARG1]] // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C126]], %[[LOWER]] - %0 = tosa.clamp %arg0 {min_int = -127 : i64, max_int = 126 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi8>) -> tensor<1xi8> - - // CHECK: linalg.generic - // CHECK: ^bb0(%[[ARG1:.+]]: i8, - // CHECK-DAG: %[[C128:.+]] = arith.constant -128 - // CHECK-DAG: %[[C127:.+]] = arith.constant 127 - // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C128]], %[[ARG1]] - // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C127]], %[[LOWER]] - %1 = tosa.clamp %arg0 {min_int = -130 : i64, max_int = 130 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi8>) -> tensor<1xi8> + %0 = tosa.clamp %arg0 {min_val = -127 : i8, max_val = 126 : i8} : (tensor<1xi8>) -> tensor<1xi8> return } @@ -830,7 +803,7 @@ func.func @test_i64(%arg0: tensor<1xi64>) -> () { // CHECK-DAG: %[[C126:.+]] = arith.constant 9223372036854775807 // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C127]], %[[ARG1]] // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C126]], %[[LOWER]] - %0 = tosa.clamp %arg0 {min_int = -9223372036854775808 : i64, max_int = 
9223372036854775807 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi64>) -> tensor<1xi64> + %0 = tosa.clamp %arg0 {min_val = -9223372036854775808 : i64, max_val = 9223372036854775807 : i64} : (tensor<1xi64>) -> tensor<1xi64> return } @@ -845,7 +818,7 @@ func.func @test_clamp_f16(%arg0: tensor<1xf16>) -> () { // CHECK-DAG: %[[C6:.+]] = arith.constant 6.0 // CHECK-DAG: %[[MIN:.+]] = arith.minimumf %[[ARG1]], %[[C6]] // CHECK-DAG: %[[MAX:.+]] = arith.maximumf %[[MIN]], %[[C0]] - %0 = tosa.clamp %arg0 {min_int = 0 : i64, max_int = 0 : i64, min_fp = 0.0 : f32, max_fp = 6.0 : f32} : (tensor<1xf16>) -> tensor<1xf16> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f16, max_val = 6.0 : f16} : (tensor<1xf16>) -> tensor<1xf16> return } diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir index d261327ec005f..fa7c030538401 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir @@ -530,22 +530,22 @@ func.func @extract_scalar_from_vec_0d_index(%arg0: vector) -> index { // ----- -func.func @extract_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const(%arg : vector<32x1xi32>) -> i32 { +func.func @extract_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const(%arg : vector<32x1xf32>) -> f32 { %0 = arith.constant 0 : index - %1 = vector.extract %arg[%0, %0] : i32 from vector<32x1xi32> - return %1 : i32 + %1 = vector.extract %arg[%0, %0] : f32 from vector<32x1xf32> + return %1 : f32 } // At compile time, since the indices of extractOp are constants, // they will be collapsed and folded away; therefore, the lowering works. // CHECK-LABEL: @extract_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const -// CHECK-SAME: %[[ARG:.*]]: vector<32x1xi32>) -> i32 { -// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<32x1xi32> to !llvm.array<32 x vector<1xi32>> -// CHECK: %[[VEC_0:.*]] = llvm.extractvalue %[[CAST]][0] : !llvm.array<32 x vector<1xi32>> +// CHECK-SAME: %[[ARG:.*]]: vector<32x1xf32>) -> f32 { +// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<32x1xf32> to !llvm.array<32 x vector<1xf32>> +// CHECK: %[[VEC_0:.*]] = llvm.extractvalue %[[CAST]][0] : !llvm.array<32 x vector<1xf32>> // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[RES:.*]] = llvm.extractelement %[[VEC_0]]{{\[}}%[[C0]] : i64] : vector<1xi32> -// CHECK: return %[[RES]] : i32 +// CHECK: %[[RES:.*]] = llvm.extractelement %[[VEC_0]]{{\[}}%[[C0]] : i64] : vector<1xf32> +// CHECK: return %[[RES]] : f32 // ----- @@ -800,26 +800,26 @@ func.func @insert_scalar_into_vec_2d_f32_dynamic_idx_scalable(%arg0: vector<1x[1 // ----- -func.func @insert_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const(%arg : vector<4x1xi32>) -> vector<4x1xi32> { +func.func @insert_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const(%arg : vector<4x1xf32>) -> vector<4x1xf32> { %0 = arith.constant 0 : index - %1 = arith.constant 1 : i32 - %res = vector.insert %1, %arg[%0, %0] : i32 into vector<4x1xi32> - return %res : vector<4x1xi32> + %1 = arith.constant 1.0 : f32 + %res = vector.insert %1, %arg[%0, %0] : f32 into vector<4x1xf32> + return %res : vector<4x1xf32> } // At compile time, since the indices of insertOp are constants, // they will be collapsed and folded away; therefore, the lowering works. 
// CHECK-LABEL: @insert_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const -// CHECK-SAME: %[[ARG:.*]]: vector<4x1xi32>) -> vector<4x1xi32> { -// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<4x1xi32> to !llvm.array<4 x vector<1xi32>> -// CHECK: %[[C1:.*]] = arith.constant 1 : i32 -// CHECK: %[[VEC_0:.*]] = llvm.extractvalue %[[CAST]][0] : !llvm.array<4 x vector<1xi32>> +// CHECK-SAME: %[[ARG:.*]]: vector<4x1xf32>) -> vector<4x1xf32> { +// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<4x1xf32> to !llvm.array<4 x vector<1xf32>> +// CHECK: %[[C1:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK: %[[VEC_0:.*]] = llvm.extractvalue %[[CAST]][0] : !llvm.array<4 x vector<1xf32>> // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VEC_1:.*]] = llvm.insertelement %[[C1]], %[[VEC_0]]{{\[}}%[[C0]] : i64] : vector<1xi32> -// CHECK: %[[VEC_2:.*]] = llvm.insertvalue %[[VEC_1]], %[[CAST]][0] : !llvm.array<4 x vector<1xi32>> -// CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[VEC_2]] : !llvm.array<4 x vector<1xi32>> to vector<4x1xi32> -// CHECK: return %[[RES]] : vector<4x1xi32> +// CHECK: %[[VEC_1:.*]] = llvm.insertelement %[[C1]], %[[VEC_0]]{{\[}}%[[C0]] : i64] : vector<1xf32> +// CHECK: %[[VEC_2:.*]] = llvm.insertvalue %[[VEC_1]], %[[CAST]][0] : !llvm.array<4 x vector<1xf32>> +// CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[VEC_2]] : !llvm.array<4 x vector<1xf32>> to vector<4x1xf32> +// CHECK: return %[[RES]] : vector<4x1xf32> // ----- diff --git a/mlir/test/Dialect/Affine/loop-fusion-2.mlir b/mlir/test/Dialect/Affine/loop-fusion-2.mlir index 8fec24f71b14a..99207e4910462 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-2.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-2.mlir @@ -389,6 +389,8 @@ func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10 // ----- +// Test sibling fusion of two matrix-vector products sharing the input matrix. + func.func @two_matrix_vector_products() { %in_matrix = memref.alloc() : memref<10x10xf32> %in_vec0 = memref.alloc() : memref<10xf32> diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir index 2830235431c76..788d7f9470530 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir @@ -1,5 +1,8 @@ // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer fusion-maximal}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER-MAXIMAL // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL +// All fusion: producer-consumer and sibling. +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s --check-prefix=ALL // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(spirv.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=SPIRV // Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir. 
@@ -108,6 +111,7 @@ func.func @check_src_dst_step(%m : memref<100xf32>, func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) { %cst_0 = arith.constant 0.000000e+00 : f32 %cst_1 = arith.constant 1.000000e+00 : f32 + // This nest writes to %arg1 but can be eliminated post sibling fusion. affine.for %arg3 = 0 to 1 { affine.for %arg4 = 0 to 64 { %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { @@ -137,11 +141,11 @@ func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : m // since the destination loop and source loop trip counts do not // match. // SIBLING-MAXIMAL: %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32 -// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32 -// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_0:.*]]= 0 to 1 { -// SIBLING-MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 { -// SIBLING-MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) { -// SIBLING-MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) { +// SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32 +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 1 { +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 { +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 32 iter_args(%{{.*}} = %[[cst_1]]) -> (f32) { +// SIBLING-MAXIMAL-NEXT: affine.for %{{.*}} = 0 to 64 iter_args(%{{.*}} = %[[cst_0]]) -> (f32) { // ----- @@ -315,11 +319,16 @@ func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref< return } +// ----- + // PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores +// ALL-LABEL: func @same_memref_load_multiple_stores func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){ %cst = arith.constant 2.000000e+00 : f32 - // Source isn't removed. + // Ensure that source isn't removed during both producer-consumer fusion and + // sibling fusion. // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32 + // ALL: affine.for %{{.*}} = 0 to 32 affine.for %arg3 = 0 to 32 { %0 = affine.load %producer[%arg3] : memref<32xf32> %2 = arith.mulf %0, %cst : f32 @@ -343,5 +352,42 @@ func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %produce // PRODUCER-CONSUMER-NEXT: arith.addf // PRODUCER-CONSUMER-NEXT: affine.store // PRODUCER-CONSUMER-NEXT: } + // ALL: affine.for %{{.*}} = 0 to 16 + // ALL: mulf + // ALL: addf + return +} + +#map = affine_map<()[s0] -> (s0 + 5)> +#map1 = affine_map<()[s0] -> (s0 + 17)> + +// Test with non-int/float memref types. 
+ +// PRODUCER-CONSUMER-MAXIMAL-LABEL: func @memref_index_type +func.func @memref_index_type() { + %0 = llvm.mlir.constant(2 : index) : i64 + %2 = llvm.mlir.constant(0 : index) : i64 + %3 = builtin.unrealized_conversion_cast %2 : i64 to index + %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x18xf32> + %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<3xf32> + %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<3xindex> + affine.for %arg3 = 0 to 3 { + %4 = affine.load %alloc_2[%arg3] : memref<3xindex> + %5 = builtin.unrealized_conversion_cast %4 : index to i64 + %6 = llvm.sub %0, %5 : i64 + %7 = builtin.unrealized_conversion_cast %6 : i64 to index + affine.store %7, %alloc_2[%arg3] : memref<3xindex> + } + affine.for %arg3 = 0 to 3 { + %4 = affine.load %alloc_2[%arg3] : memref<3xindex> + %5 = affine.apply #map()[%4] + %6 = affine.apply #map1()[%3] + %7 = memref.load %alloc[%5, %6] : memref<8x18xf32> + affine.store %7, %alloc_1[%arg3] : memref<3xf32> + } + // Expect fusion. + // PRODUCER-CONSUMER-MAXIMAL: affine.for + // PRODUCER-CONSUMER-MAXIMAL-NOT: affine.for + // PRODUCER-CONSUMER-MAXIMAL: return return } diff --git a/mlir/test/Dialect/Affine/loop-fusion.mlir b/mlir/test/Dialect/Affine/loop-fusion.mlir index 1c119e87c5336..dcd2e1cdb275a 100644 --- a/mlir/test/Dialect/Affine/loop-fusion.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion.mlir @@ -1206,6 +1206,9 @@ func.func @should_fuse_with_private_memref() { // CHECK: affine.for %{{.*}} = 0 to 17 { // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32> + // CHECK-NEXT: } + // CHECK: affine.for %{{.*}} = 0 to 82 { + // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32> // CHECK-NEXT: } // CHECK-NEXT: return diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index e398c3fe2011d..574e9f41494af 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -1,8 +1,9 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-full" | FileCheck %s --check-prefix UNROLL-FULL -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-full unroll-full-threshold=2" | FileCheck %s --check-prefix SHORT -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=4" | FileCheck %s --check-prefix UNROLL-BY-4 -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=1" | FileCheck %s --check-prefix UNROLL-BY-1 -// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=5 cleanup-unroll=true" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true}))" | FileCheck %s --check-prefix UNROLL-FULL +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-full=true unroll-full-threshold=2}))" | FileCheck %s --check-prefix SHORT +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=4}))" | FileCheck %s --check-prefix UNROLL-BY-4 +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=1}))" | FileCheck %s --check-prefix UNROLL-BY-1 +// RUN: mlir-opt -allow-unregistered-dialect %s 
-pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}))" | FileCheck %s --check-prefix UNROLL-CLEANUP-LOOP +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(gpu.module(gpu.func(affine-loop-unroll{unroll-full=true})))" | FileCheck %s --check-prefix GPU-UNROLL-FULL // UNROLL-FULL-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> // UNROLL-FULL-DAG: [[$MAP1:#map[0-9]*]] = affine_map<(d0) -> (d0 + 2)> @@ -240,6 +241,23 @@ func.func @loop_nest_unroll_full() { return } // UNROLL-FULL } +gpu.module @unroll_full { + // GPU-UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() { + gpu.func @gpu_loop_nest_simplest() { + // GPU-UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 { + affine.for %i = 0 to 100 step 2 { + // GPU-UNROLL-FULL: %c1_i32 = arith.constant 1 : i32 + // GPU-UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32 + // GPU-UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32 + // GPU-UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32 + affine.for %j = 0 to 4 { + %x = arith.constant 1 : i32 + } + } // GPU-UNROLL-FULL: } + gpu.return // GPU-UNROLL-FULL: return + } +} + // SHORT-LABEL: func @loop_nest_outer_unroll() { func.func @loop_nest_outer_unroll() { // SHORT: affine.for %arg0 = 0 to 4 { diff --git a/mlir/test/Dialect/Arith/mesh-spmdize.mlir b/mlir/test/Dialect/Arith/mesh-spmdize.mlir new file mode 100644 index 0000000000000..6b55dd533a92c --- /dev/null +++ b/mlir/test/Dialect/Arith/mesh-spmdize.mlir @@ -0,0 +1,17 @@ +// RUN: mlir-opt \ +// RUN: --pass-pipeline="builtin.module(func.func(mesh-spmdization))" \ +// RUN: %s | FileCheck %s + +mesh.mesh @mesh4x4(shape = 4x4) + +// CHECK-LABEL: func @test_spmdize_constant +// CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<0.000000e+00> : +// tensor<256x1024xf32> CHECK-NEXT: [[vc434_i32:%.*]] = arith.constant 434 : +// i32 CHECK-NEXT: return [[vcst]] : tensor<256x1024xf32> +func.func @test_spmdize_constant() ->(tensor<1024x1024xf32>)attributes{llvm.emit_c_interface} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %sharding_1 = mesh.sharding @mesh4x4 split_axes = [[0]] : !mesh.sharding + %sharding_annotated_1 = mesh.shard %cst to %sharding_1 : tensor<1024x1024xf32> + %ci = arith.constant 434 : i32 + return %sharding_annotated_1 : tensor<1024x1024xf32> +} diff --git a/mlir/test/Dialect/Arith/sharding-propagation.mlir b/mlir/test/Dialect/Arith/sharding-propagation.mlir new file mode 100644 index 0000000000000..19eb340549b0b --- /dev/null +++ b/mlir/test/Dialect/Arith/sharding-propagation.mlir @@ -0,0 +1,54 @@ +// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation))" %s | FileCheck %s + +mesh.mesh @mesh4x4(shape = 4x4) + +// CHECK-LABEL: func.func @test_shard_constant() -> tensor<1024x1024xf32> attributes {llvm.emit_c_interface} { +// CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated:%.*]] = mesh.shard [[vcst]] to [[vsharding]] : tensor<1024x1024xf32> +// CHECK-NEXT: [[vcst_0:%.*]] = arith.constant 4.340000e+01 : f32 +// CHECK-NEXT: [[v0:%.*]] = tensor.empty() : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_1:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_2:%.*]] = mesh.shard [[v0]] to [[vsharding_1]] : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_3:%.*]] = mesh.sharding 
@mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_4:%.*]] = mesh.shard [[vsharding_annotated]] to [[vsharding_3]] annotate_for_users : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_5:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_6:%.*]] = mesh.shard [[vsharding_annotated_2]] to [[vsharding_5]] annotate_for_users : tensor<1024x1024xf32> +// CHECK-NEXT: [[v1:%.*]] = linalg.add ins([[vsharding_annotated_4]], [[vcst_0]] : tensor<1024x1024xf32>, f32) outs([[vsharding_annotated_6]] : tensor<1024x1024xf32>) -> tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_7:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_8:%.*]] = mesh.shard [[v1]] to [[vsharding_7]] : tensor<1024x1024xf32> +// CHECK-NEXT: return [[vsharding_annotated_8]] : tensor<1024x1024xf32> +func.func @test_shard_constant() -> (tensor<1024x1024xf32>) attributes {llvm.emit_c_interface} { + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %sharding_1 = mesh.sharding @mesh4x4 split_axes = [[0]] : !mesh.sharding + %sharding_annotated_1 = mesh.shard %cst_1 to %sharding_1 : tensor<1024x1024xf32> + %ci = arith.constant 43.4e+00 : f32 + %o1 = tensor.empty() : tensor<1024x1024xf32> + %res = linalg.add ins(%sharding_annotated_1, %ci : tensor<1024x1024xf32>, f32) outs(%o1 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + return %res : tensor<1024x1024xf32> +} + +// CHECK-LABEL: func.func @test_shard_constant_back() -> tensor<1024x1024xf32> attributes {llvm.emit_c_interface} { +// CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated:%.*]] = mesh.shard [[vcst]] to [[vsharding]] : tensor<1024x1024xf32> +// CHECK-NEXT: [[vcst_0:%.*]] = arith.constant 4.340000e+01 : f32 +// CHECK-NEXT: [[v0:%.*]] = tensor.empty() : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_1:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_2:%.*]] = mesh.shard [[v0]] to [[vsharding_1]] : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_3:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_4:%.*]] = mesh.shard [[vsharding_annotated]] to [[vsharding_3]] annotate_for_users : tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_5:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_6:%.*]] = mesh.shard [[vsharding_annotated_2]] to [[vsharding_5]] annotate_for_users : tensor<1024x1024xf32> +// CHECK-NEXT: [[v1:%.*]] = linalg.add ins([[vsharding_annotated_4]], [[vcst_0]] : tensor<1024x1024xf32>, f32) outs([[vsharding_annotated_6]] : tensor<1024x1024xf32>) -> tensor<1024x1024xf32> +// CHECK-NEXT: [[vsharding_7:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding +// CHECK-NEXT: [[vsharding_annotated_8:%.*]] = mesh.shard [[v1]] to [[vsharding_7]] : tensor<1024x1024xf32> +func.func @test_shard_constant_back() -> (tensor<1024x1024xf32>) attributes {llvm.emit_c_interface} { + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %ci = arith.constant 43.4e+00 : f32 + %o1 = tensor.empty() : tensor<1024x1024xf32> + %res = linalg.add ins(%cst_1, %ci : tensor<1024x1024xf32>, f32) outs(%o1 : tensor<1024x1024xf32>) -> 
tensor<1024x1024xf32> + %sharding_1 = mesh.sharding @mesh4x4 split_axes = [[0]] : !mesh.sharding + %sharding_annotated_1 = mesh.shard %res to %sharding_1 : tensor<1024x1024xf32> + return %sharding_annotated_1 : tensor<1024x1024xf32> +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir deleted file mode 100644 index 3fbe3913c6549..0000000000000 --- a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir +++ /dev/null @@ -1,1462 +0,0 @@ -// RUN: mlir-opt -verify-diagnostics -buffer-deallocation -split-input-file %s | FileCheck %s - -// This file checks the behaviour of BufferDeallocation pass for moving and -// inserting missing DeallocOps in their correct positions. Furthermore, -// copies and their corresponding AllocOps are inserted. - -// Test Case: -// bb0 -// / \ -// bb1 bb2 <- Initial position of AllocOp -// \ / -// bb3 -// BufferDeallocation expected behavior: bb2 contains an AllocOp which is -// passed to bb3. In the latter block, there should be an deallocation. -// Since bb1 does not contain an adequate alloc and the alloc in bb2 is not -// moved to bb0, we need to insert allocs and copies. - -// CHECK-LABEL: func @condBranch -func.func @condBranch(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) -^bb2: - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: cf.cond_br -// CHECK: %[[ALLOC0:.*]] = bufferization.clone -// CHECK-NEXT: cf.br ^bb3(%[[ALLOC0]] -// CHECK: %[[ALLOC1:.*]] = memref.alloc -// CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC2:.*]] = bufferization.clone %[[ALLOC1]] -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK-NEXT: cf.br ^bb3(%[[ALLOC2]] -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 -// / \ -// bb1 bb2 <- Initial position of AllocOp -// \ / -// bb3 -// BufferDeallocation expected behavior: The existing AllocOp has a dynamic -// dependency to block argument %0 in bb2. Since the dynamic type is passed -// to bb3 via the block argument %2, it is currently required to allocate a -// temporary buffer for %2 that gets copies of %arg0 and %1 with their -// appropriate shape dimensions. The copy buffer deallocation will be applied -// to %2 in block bb3. 
- -// CHECK-LABEL: func @condBranchDynamicType -func.func @condBranchDynamicType( - %arg0: i1, - %arg1: memref, - %arg2: memref, - %arg3: index) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg3: index) -^bb1: - cf.br ^bb3(%arg1 : memref) -^bb2(%0: index): - %1 = memref.alloc(%0) : memref - test.buffer_based in(%arg1: memref) out(%1: memref) - cf.br ^bb3(%1 : memref) -^bb3(%2: memref): - test.copy(%2, %arg2) : (memref, memref) - return -} - -// CHECK-NEXT: cf.cond_br -// CHECK: %[[ALLOC0:.*]] = bufferization.clone -// CHECK-NEXT: cf.br ^bb3(%[[ALLOC0]] -// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) -// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]]) -// CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC2:.*]] = bufferization.clone -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK-NEXT: cf.br ^bb3 -// CHECK-NEXT: ^bb3(%[[ALLOC3:.*]]:{{.*}}) -// CHECK: test.copy(%[[ALLOC3]], -// CHECK-NEXT: memref.dealloc %[[ALLOC3]] -// CHECK-NEXT: return - -// ----- - -// Test case: See above. - -// CHECK-LABEL: func @condBranchUnrankedType -func.func @condBranchUnrankedType( - %arg0: i1, - %arg1: memref<*xf32>, - %arg2: memref<*xf32>, - %arg3: index) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg3: index) -^bb1: - cf.br ^bb3(%arg1 : memref<*xf32>) -^bb2(%0: index): - %1 = memref.alloc(%0) : memref - %2 = memref.cast %1 : memref to memref<*xf32> - test.buffer_based in(%arg1: memref<*xf32>) out(%2: memref<*xf32>) - cf.br ^bb3(%2 : memref<*xf32>) -^bb3(%3: memref<*xf32>): - test.copy(%3, %arg2) : (memref<*xf32>, memref<*xf32>) - return -} - -// CHECK-NEXT: cf.cond_br -// CHECK: %[[ALLOC0:.*]] = bufferization.clone -// CHECK-NEXT: cf.br ^bb3(%[[ALLOC0]] -// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) -// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]]) -// CHECK: test.buffer_based -// CHECK-NEXT: %[[ALLOC2:.*]] = bufferization.clone -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK-NEXT: cf.br ^bb3 -// CHECK-NEXT: ^bb3(%[[ALLOC3:.*]]:{{.*}}) -// CHECK: test.copy(%[[ALLOC3]], -// CHECK-NEXT: memref.dealloc %[[ALLOC3]] -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 -// / \ -// bb1 bb2 <- Initial position of AllocOp -// | / \ -// | bb3 bb4 -// | \ / -// \ bb5 -// \ / -// bb6 -// | -// bb7 -// BufferDeallocation expected behavior: The existing AllocOp has a dynamic -// dependency to block argument %0 in bb2. Since the dynamic type is passed to -// bb5 via the block argument %2 and to bb6 via block argument %3, it is -// currently required to allocate temporary buffers for %2 and %3 that gets -// copies of %1 and %arg0 1 with their appropriate shape dimensions. The copy -// buffer deallocations will be applied to %2 in block bb5 and to %3 in block -// bb6. Furthermore, there should be no copy inserted for %4. 
- -// CHECK-LABEL: func @condBranchDynamicTypeNested -func.func @condBranchDynamicTypeNested( - %arg0: i1, - %arg1: memref, - %arg2: memref, - %arg3: index) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg3: index) -^bb1: - cf.br ^bb6(%arg1 : memref) -^bb2(%0: index): - %1 = memref.alloc(%0) : memref - test.buffer_based in(%arg1: memref) out(%1: memref) - cf.cond_br %arg0, ^bb3, ^bb4 -^bb3: - cf.br ^bb5(%1 : memref) -^bb4: - cf.br ^bb5(%1 : memref) -^bb5(%2: memref): - cf.br ^bb6(%2 : memref) -^bb6(%3: memref): - cf.br ^bb7(%3 : memref) -^bb7(%4: memref): - test.copy(%4, %arg2) : (memref, memref) - return -} - -// CHECK-NEXT: cf.cond_br{{.*}} -// CHECK-NEXT: ^bb1 -// CHECK-NEXT: %[[ALLOC0:.*]] = bufferization.clone -// CHECK-NEXT: cf.br ^bb6(%[[ALLOC0]] -// CHECK: ^bb2(%[[IDX:.*]]:{{.*}}) -// CHECK-NEXT: %[[ALLOC1:.*]] = memref.alloc(%[[IDX]]) -// CHECK-NEXT: test.buffer_based -// CHECK: cf.cond_br -// CHECK: ^bb3: -// CHECK-NEXT: cf.br ^bb5(%[[ALLOC1]]{{.*}}) -// CHECK: ^bb4: -// CHECK-NEXT: cf.br ^bb5(%[[ALLOC1]]{{.*}}) -// CHECK-NEXT: ^bb5(%[[ALLOC2:.*]]:{{.*}}) -// CHECK-NEXT: %[[ALLOC3:.*]] = bufferization.clone %[[ALLOC2]] -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK-NEXT: cf.br ^bb6(%[[ALLOC3]]{{.*}}) -// CHECK-NEXT: ^bb6(%[[ALLOC4:.*]]:{{.*}}) -// CHECK-NEXT: cf.br ^bb7(%[[ALLOC4]]{{.*}}) -// CHECK-NEXT: ^bb7(%[[ALLOC5:.*]]:{{.*}}) -// CHECK: test.copy(%[[ALLOC5]], -// CHECK-NEXT: memref.dealloc %[[ALLOC4]] -// CHECK-NEXT: return - -// ----- - -// Test Case: Existing AllocOp with no users. -// BufferDeallocation expected behavior: It should insert a DeallocOp right -// before ReturnOp. - -// CHECK-LABEL: func @emptyUsesValue -func.func @emptyUsesValue(%arg0: memref<4xf32>) { - %0 = memref.alloc() : memref<4xf32> - return -} -// CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: memref.dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 -// / \ -// | bb1 <- Initial position of AllocOp -// \ / -// bb2 -// BufferDeallocation expected behavior: It should insert a DeallocOp at the -// exit block after CopyOp since %1 is an alias for %0 and %arg1. Furthermore, -// we have to insert a copy and an alloc in the beginning of the function. - -// CHECK-LABEL: func @criticalEdge -func.func @criticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>) -^bb1: - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.br ^bb2(%0 : memref<2xf32>) -^bb2(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[ALLOC0:.*]] = bufferization.clone -// CHECK-NEXT: cf.cond_br -// CHECK: %[[ALLOC1:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC2:.*]] = bufferization.clone %[[ALLOC1]] -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 <- Initial position of AllocOp -// / \ -// | bb1 -// \ / -// bb2 -// BufferDeallocation expected behavior: It only inserts a DeallocOp at the -// exit block after CopyOp since %1 is an alias for %0 and %arg1. 
- -// CHECK-LABEL: func @invCriticalEdge -func.func @invCriticalEdge(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.cond_br %arg0, ^bb1, ^bb2(%arg1 : memref<2xf32>) -^bb1: - cf.br ^bb2(%0 : memref<2xf32>) -^bb2(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK: dealloc -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 <- Initial position of the first AllocOp -// / \ -// bb1 bb2 -// \ / -// bb3 <- Initial position of the second AllocOp -// BufferDeallocation expected behavior: It only inserts two missing -// DeallocOps in the exit block. %5 is an alias for %0. Therefore, the -// DeallocOp for %0 should occur after the last BufferBasedOp. The Dealloc for -// %7 should happen after CopyOp. - -// CHECK-LABEL: func @ifElse -func.func @ifElse(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.cond_br %arg0, - ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), - ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) -^bb1(%1: memref<2xf32>, %2: memref<2xf32>): - cf.br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) -^bb2(%3: memref<2xf32>, %4: memref<2xf32>): - cf.br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) -^bb3(%5: memref<2xf32>, %6: memref<2xf32>): - %7 = memref.alloc() : memref<2xf32> - test.buffer_based in(%5: memref<2xf32>) out(%7: memref<2xf32>) - test.copy(%7, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK: %[[SECOND_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK: memref.dealloc %[[FIRST_ALLOC]] -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[SECOND_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// Test Case: No users for buffer in if-else CFG -// bb0 <- Initial position of AllocOp -// / \ -// bb1 bb2 -// \ / -// bb3 -// BufferDeallocation expected behavior: It only inserts a missing DeallocOp -// in the exit block since %5 or %6 are the latest aliases of %0. - -// CHECK-LABEL: func @ifElseNoUsers -func.func @ifElseNoUsers(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.cond_br %arg0, - ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), - ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) -^bb1(%1: memref<2xf32>, %2: memref<2xf32>): - cf.br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) -^bb2(%3: memref<2xf32>, %4: memref<2xf32>): - cf.br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) -^bb3(%5: memref<2xf32>, %6: memref<2xf32>): - test.copy(%arg1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = memref.alloc() -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[FIRST_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 <- Initial position of the first AllocOp -// / \ -// bb1 bb2 -// | / \ -// | bb3 bb4 -// \ \ / -// \ / -// bb5 <- Initial position of the second AllocOp -// BufferDeallocation expected behavior: Two missing DeallocOps should be -// inserted in the exit block. 
- -// CHECK-LABEL: func @ifElseNested -func.func @ifElseNested(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.cond_br %arg0, - ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), - ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) -^bb1(%1: memref<2xf32>, %2: memref<2xf32>): - cf.br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) -^bb2(%3: memref<2xf32>, %4: memref<2xf32>): - cf.cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) -^bb3(%5: memref<2xf32>): - cf.br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) -^bb4(%6: memref<2xf32>): - cf.br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) -^bb5(%7: memref<2xf32>, %8: memref<2xf32>): - %9 = memref.alloc() : memref<2xf32> - test.buffer_based in(%7: memref<2xf32>) out(%9: memref<2xf32>) - test.copy(%9, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK: %[[SECOND_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK: memref.dealloc %[[FIRST_ALLOC]] -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[SECOND_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// Test Case: Dead operations in a single block. -// BufferDeallocation expected behavior: It only inserts the two missing -// DeallocOps after the last BufferBasedOp. - -// CHECK-LABEL: func @redundantOperations -func.func @redundantOperations(%arg0: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg0: memref<2xf32>) out(%0: memref<2xf32>) - %1 = memref.alloc() : memref<2xf32> - test.buffer_based in(%0: memref<2xf32>) out(%1: memref<2xf32>) - return -} - -// CHECK: (%[[ARG0:.*]]: {{.*}}) -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based in(%[[ARG0]]{{.*}}out(%[[FIRST_ALLOC]] -// CHECK: %[[SECOND_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based in(%[[FIRST_ALLOC]]{{.*}}out(%[[SECOND_ALLOC]] -// CHECK: dealloc -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// Test Case: -// bb0 -// / \ -// Initial pos of the 1st AllocOp -> bb1 bb2 <- Initial pos of the 2nd AllocOp -// \ / -// bb3 -// BufferDeallocation expected behavior: We need to introduce a copy for each -// buffer since the buffers are passed to bb3. The both missing DeallocOps are -// inserted in the respective block of the allocs. The copy is freed in the exit -// block. 
- -// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc -func.func @moving_alloc_and_inserting_missing_dealloc( - %cond: i1, - %arg0: memref<2xf32>, - %arg1: memref<2xf32>) { - cf.cond_br %cond, ^bb1, ^bb2 -^bb1: - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg0: memref<2xf32>) out(%0: memref<2xf32>) - cf.br ^exit(%0 : memref<2xf32>) -^bb2: - %1 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg0: memref<2xf32>) out(%1: memref<2xf32>) - cf.br ^exit(%1 : memref<2xf32>) -^exit(%arg2: memref<2xf32>): - test.copy(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: cf.cond_br{{.*}} -// CHECK-NEXT: ^bb1 -// CHECK: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC1:.*]] = bufferization.clone %[[ALLOC0]] -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: cf.br ^bb3(%[[ALLOC1]] -// CHECK-NEXT: ^bb2 -// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK-NEXT: %[[ALLOC3:.*]] = bufferization.clone %[[ALLOC2]] -// CHECK-NEXT: memref.dealloc %[[ALLOC2]] -// CHECK-NEXT: cf.br ^bb3(%[[ALLOC3]] -// CHECK-NEXT: ^bb3(%[[ALLOC4:.*]]:{{.*}}) -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[ALLOC4]] -// CHECK-NEXT: return - -// ----- - -// Test Case: Invalid position of the DeallocOp. There is a user after -// deallocation. -// bb0 -// / \ -// bb1 bb2 <- Initial position of AllocOp -// \ / -// bb3 -// BufferDeallocation expected behavior: The existing DeallocOp should be -// moved to exit block. - -// CHECK-LABEL: func @moving_invalid_dealloc_op_complex -func.func @moving_invalid_dealloc_op_complex( - %cond: i1, - %arg0: memref<2xf32>, - %arg1: memref<2xf32>) { - %1 = memref.alloc() : memref<2xf32> - cf.cond_br %cond, ^bb1, ^bb2 -^bb1: - cf.br ^exit(%arg0 : memref<2xf32>) -^bb2: - test.buffer_based in(%arg0: memref<2xf32>) out(%1: memref<2xf32>) - memref.dealloc %1 : memref<2xf32> - cf.br ^exit(%1 : memref<2xf32>) -^exit(%arg2: memref<2xf32>): - test.copy(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: cf.cond_br -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: return - -// ----- - -// Test Case: Inserting missing DeallocOp in a single block. - -// CHECK-LABEL: func @inserting_missing_dealloc_simple -func.func @inserting_missing_dealloc_simple( - %arg0 : memref<2xf32>, - %arg1: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg0: memref<2xf32>) out(%0: memref<2xf32>) - test.copy(%0, %arg1) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc() -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] - -// ----- - -// Test Case: Moving invalid DeallocOp (there is a user after deallocation) in a -// single block. - -// CHECK-LABEL: func @moving_invalid_dealloc_op -func.func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg0: memref<2xf32>) out(%0: memref<2xf32>) - memref.dealloc %0 : memref<2xf32> - test.copy(%0, %arg1) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc() -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] - -// ----- - -// Test Case: Nested regions - This test defines a BufferBasedOp inside the -// region of a RegionBufferBasedOp. 
-// BufferDeallocation expected behavior: The AllocOp for the BufferBasedOp -// should remain inside the region of the RegionBufferBasedOp and it should insert -// the missing DeallocOp in the same region. The missing DeallocOp should be -// inserted after CopyOp. - -// CHECK-LABEL: func @nested_regions_and_cond_branch -func.func @nested_regions_and_cond_branch( - %arg0: i1, - %arg1: memref<2xf32>, - %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) -^bb2: - %0 = memref.alloc() : memref<2xf32> - test.region_buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) { - ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): - %1 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%1: memref<2xf32>) - %tmp1 = math.exp %gen1_arg0 : f32 - test.region_yield %tmp1 : f32 - } - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} -// CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}}) -// CHECK-NEXT: cf.cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]] -// CHECK: %[[ALLOC0:.*]] = bufferization.clone %[[ARG1]] -// CHECK: ^[[BB2]]: -// CHECK: %[[ALLOC1:.*]] = memref.alloc() -// CHECK-NEXT: test.region_buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC1]] -// CHECK: %[[ALLOC2:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC2]] -// CHECK: memref.dealloc %[[ALLOC2]] -// CHECK-NEXT: %{{.*}} = math.exp -// CHECK: %[[ALLOC3:.*]] = bufferization.clone %[[ALLOC1]] -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK: ^[[BB3:.*]]({{.*}}): -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc - -// ----- - -// Test Case: buffer deallocation escaping -// BufferDeallocation expected behavior: It must not dealloc %arg1 and %x -// since they are operands of return operation and should escape from -// deallocating. It should dealloc %y after CopyOp. - -// CHECK-LABEL: func @memref_in_function_results -func.func @memref_in_function_results( - %arg0: memref<5xf32>, - %arg1: memref<10xf32>, - %arg2: memref<5xf32>) -> (memref<10xf32>, memref<15xf32>) { - %x = memref.alloc() : memref<15xf32> - %y = memref.alloc() : memref<5xf32> - test.buffer_based in(%arg0: memref<5xf32>) out(%y: memref<5xf32>) - test.copy(%y, %arg2) : (memref<5xf32>, memref<5xf32>) - return %arg1, %x : memref<10xf32>, memref<15xf32> -} -// CHECK: (%[[ARG0:.*]]: memref<5xf32>, %[[ARG1:.*]]: memref<10xf32>, -// CHECK-SAME: %[[RESULT:.*]]: memref<5xf32>) -// CHECK: %[[X:.*]] = memref.alloc() -// CHECK: %[[Y:.*]] = memref.alloc() -// CHECK: test.copy -// CHECK: memref.dealloc %[[Y]] -// CHECK: return %[[ARG1]], %[[X]] - -// ----- - -// Test Case: nested region control flow -// The alloc %1 flows through both if branches until it is finally returned. -// Hence, it does not require a specific dealloc operation. However, %3 -// requires a dealloc. 
- -// CHECK-LABEL: func @nested_region_control_flow -func.func @nested_region_control_flow( - %arg0 : index, - %arg1 : index) -> memref { - %0 = arith.cmpi eq, %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - scf.yield %1 : memref - } else { - %3 = memref.alloc(%arg0, %arg1) : memref - scf.yield %1 : memref - } - return %2 : memref -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc(%arg0, %arg0) -// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if -// CHECK: scf.yield %[[ALLOC0]] -// CHECK: %[[ALLOC2:.*]] = memref.alloc(%arg0, %arg1) -// CHECK-NEXT: memref.dealloc %[[ALLOC2]] -// CHECK-NEXT: scf.yield %[[ALLOC0]] -// CHECK: return %[[ALLOC1]] - -// ----- - -// Test Case: nested region control flow with a nested buffer allocation in a -// divergent branch. -// Buffer deallocation places a copy for both %1 and %3, since they are -// returned in the end. - -// CHECK-LABEL: func @nested_region_control_flow_div -func.func @nested_region_control_flow_div( - %arg0 : index, - %arg1 : index) -> memref { - %0 = arith.cmpi eq, %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - scf.yield %1 : memref - } else { - %3 = memref.alloc(%arg0, %arg1) : memref - scf.yield %3 : memref - } - return %2 : memref -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc(%arg0, %arg0) -// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if -// CHECK-NEXT: %[[ALLOC2:.*]] = bufferization.clone %[[ALLOC0]] -// CHECK: scf.yield %[[ALLOC2]] -// CHECK: %[[ALLOC3:.*]] = memref.alloc(%arg0, %arg1) -// CHECK-NEXT: %[[ALLOC4:.*]] = bufferization.clone %[[ALLOC3]] -// CHECK: memref.dealloc %[[ALLOC3]] -// CHECK: scf.yield %[[ALLOC4]] -// CHECK: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: return %[[ALLOC1]] - -// ----- - -// Test Case: nested region control flow within a region interface. -// No copies are required in this case since the allocation finally escapes -// the method. - -// CHECK-LABEL: func @inner_region_control_flow -func.func @inner_region_control_flow(%arg0 : index) -> memref { - %0 = memref.alloc(%arg0, %arg0) : memref - %1 = test.region_if %0 : memref -> (memref) then { - ^bb0(%arg1 : memref): - test.region_if_yield %arg1 : memref - } else { - ^bb0(%arg1 : memref): - test.region_if_yield %arg1 : memref - } join { - ^bb0(%arg1 : memref): - test.region_if_yield %arg1 : memref - } - return %1 : memref -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc(%arg0, %arg0) -// CHECK-NEXT: %[[ALLOC1:.*]] = test.region_if -// CHECK-NEXT: ^bb0(%[[ALLOC2:.*]]:{{.*}}): -// CHECK-NEXT: test.region_if_yield %[[ALLOC2]] -// CHECK: ^bb0(%[[ALLOC3:.*]]:{{.*}}): -// CHECK-NEXT: test.region_if_yield %[[ALLOC3]] -// CHECK: ^bb0(%[[ALLOC4:.*]]:{{.*}}): -// CHECK-NEXT: test.region_if_yield %[[ALLOC4]] -// CHECK: return %[[ALLOC1]] - -// ----- - -// CHECK-LABEL: func @subview -func.func @subview(%arg0 : index, %arg1 : index, %arg2 : memref) { - %0 = memref.alloc() : memref<64x4xf32, strided<[4, 1], offset: 0>> - %1 = memref.subview %0[%arg0, %arg1][%arg0, %arg1][%arg0, %arg1] : - memref<64x4xf32, strided<[4, 1], offset: 0>> - to memref> - test.copy(%1, %arg2) : - (memref>, memref) - return -} - -// CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: memref.subview -// CHECK-NEXT: test.copy -// CHECK-NEXT: memref.dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// Test Case: In the presence of AllocaOps only the AllocOps has top be freed. -// Therefore, all allocas are not handled. 
- -// CHECK-LABEL: func @condBranchAlloca -func.func @condBranchAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) -^bb2: - %0 = memref.alloca() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: cf.cond_br -// CHECK: %[[ALLOCA:.*]] = memref.alloca() -// CHECK: cf.br ^bb3(%[[ALLOCA:.*]]) -// CHECK-NEXT: ^bb3 -// CHECK-NEXT: test.copy -// CHECK-NEXT: return - -// ----- - -// Test Case: In the presence of AllocaOps only the AllocOps has top be freed. -// Therefore, all allocas are not handled. In this case, only alloc %0 has a -// dealloc. - -// CHECK-LABEL: func @ifElseAlloca -func.func @ifElseAlloca(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.cond_br %arg0, - ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), - ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) -^bb1(%1: memref<2xf32>, %2: memref<2xf32>): - cf.br ^bb3(%1, %2 : memref<2xf32>, memref<2xf32>) -^bb2(%3: memref<2xf32>, %4: memref<2xf32>): - cf.br ^bb3(%3, %4 : memref<2xf32>, memref<2xf32>) -^bb3(%5: memref<2xf32>, %6: memref<2xf32>): - %7 = memref.alloca() : memref<2xf32> - test.buffer_based in(%5: memref<2xf32>) out(%7: memref<2xf32>) - test.copy(%7, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK: %[[ALLOCA:.*]] = memref.alloca() -// CHECK-NEXT: test.buffer_based -// CHECK: memref.dealloc %[[ALLOC]] -// CHECK: test.copy -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElseNestedAlloca -func.func @ifElseNestedAlloca( - %arg0: i1, - %arg1: memref<2xf32>, - %arg2: memref<2xf32>) { - %0 = memref.alloca() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) - cf.cond_br %arg0, - ^bb1(%arg1, %0 : memref<2xf32>, memref<2xf32>), - ^bb2(%0, %arg1 : memref<2xf32>, memref<2xf32>) -^bb1(%1: memref<2xf32>, %2: memref<2xf32>): - cf.br ^bb5(%1, %2 : memref<2xf32>, memref<2xf32>) -^bb2(%3: memref<2xf32>, %4: memref<2xf32>): - cf.cond_br %arg0, ^bb3(%3 : memref<2xf32>), ^bb4(%4 : memref<2xf32>) -^bb3(%5: memref<2xf32>): - cf.br ^bb5(%5, %3 : memref<2xf32>, memref<2xf32>) -^bb4(%6: memref<2xf32>): - cf.br ^bb5(%3, %6 : memref<2xf32>, memref<2xf32>) -^bb5(%7: memref<2xf32>, %8: memref<2xf32>): - %9 = memref.alloc() : memref<2xf32> - test.buffer_based in(%7: memref<2xf32>) out(%9: memref<2xf32>) - test.copy(%9, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() -// CHECK-NEXT: test.buffer_based -// CHECK: %[[ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: test.buffer_based -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @nestedRegionsAndCondBranchAlloca -func.func @nestedRegionsAndCondBranchAlloca( - %arg0: i1, - %arg1: memref<2xf32>, - %arg2: memref<2xf32>) { - cf.cond_br %arg0, ^bb1, ^bb2 -^bb1: - cf.br ^bb3(%arg1 : memref<2xf32>) -^bb2: - %0 = memref.alloc() : memref<2xf32> - test.region_buffer_based in(%arg1: memref<2xf32>) out(%0: memref<2xf32>) { - ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): - %1 = memref.alloca() : memref<2xf32> - test.buffer_based in(%arg1: memref<2xf32>) out(%1: memref<2xf32>) - %tmp1 = 
math.exp %gen1_arg0 : f32 - test.region_yield %tmp1 : f32 - } - cf.br ^bb3(%0 : memref<2xf32>) -^bb3(%1: memref<2xf32>): - test.copy(%1, %arg2) : (memref<2xf32>, memref<2xf32>) - return -} -// CHECK: (%[[cond:.*]]: {{.*}}, %[[ARG1:.*]]: {{.*}}, %{{.*}}: {{.*}}) -// CHECK-NEXT: cf.cond_br %[[cond]], ^[[BB1:.*]], ^[[BB2:.*]] -// CHECK: ^[[BB1]]: -// CHECK: %[[ALLOC0:.*]] = bufferization.clone -// CHECK: ^[[BB2]]: -// CHECK: %[[ALLOC1:.*]] = memref.alloc() -// CHECK-NEXT: test.region_buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOC1]] -// CHECK: %[[ALLOCA:.*]] = memref.alloca() -// CHECK-NEXT: test.buffer_based in(%[[ARG1]]{{.*}}out(%[[ALLOCA]] -// CHECK: %{{.*}} = math.exp -// CHECK: %[[ALLOC2:.*]] = bufferization.clone %[[ALLOC1]] -// CHECK-NEXT: memref.dealloc %[[ALLOC1]] -// CHECK: ^[[BB3:.*]]({{.*}}): -// CHECK: test.copy -// CHECK-NEXT: memref.dealloc - -// ----- - -// CHECK-LABEL: func @nestedRegionControlFlowAlloca -func.func @nestedRegionControlFlowAlloca( - %arg0 : index, - %arg1 : index) -> memref { - %0 = arith.cmpi eq, %arg0, %arg1 : index - %1 = memref.alloc(%arg0, %arg0) : memref - %2 = scf.if %0 -> (memref) { - scf.yield %1 : memref - } else { - %3 = memref.alloca(%arg0, %arg1) : memref - scf.yield %1 : memref - } - return %2 : memref -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc(%arg0, %arg0) -// CHECK-NEXT: %[[ALLOC1:.*]] = scf.if -// CHECK: scf.yield %[[ALLOC0]] -// CHECK: %[[ALLOCA:.*]] = memref.alloca(%arg0, %arg1) -// CHECK-NEXT: scf.yield %[[ALLOC0]] -// CHECK: return %[[ALLOC1]] - -// ----- - -// Test Case: structured control-flow loop using a nested alloc. -// The iteration argument %iterBuf has to be freed before yielding %3 to avoid -// memory leaks. - -// CHECK-LABEL: func @loop_alloc -func.func @loop_alloc( - %lb: index, - %ub: index, - %step: index, - %buf: memref<2xf32>, - %res: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - %1 = scf.for %i = %lb to %ub step %step - iter_args(%iterBuf = %buf) -> memref<2xf32> { - %2 = arith.cmpi eq, %i, %ub : index - %3 = memref.alloc() : memref<2xf32> - scf.yield %3 : memref<2xf32> - } - test.copy(%1, %res) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: %[[ALLOC1:.*]] = bufferization.clone %arg3 -// CHECK: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args -// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]] -// CHECK: arith.cmpi -// CHECK: memref.dealloc %[[IALLOC]] -// CHECK: %[[ALLOC3:.*]] = memref.alloc() -// CHECK: %[[ALLOC4:.*]] = bufferization.clone %[[ALLOC3]] -// CHECK: memref.dealloc %[[ALLOC3]] -// CHECK: scf.yield %[[ALLOC4]] -// CHECK: } -// CHECK: test.copy(%[[ALLOC2]], %arg4) -// CHECK-NEXT: memref.dealloc %[[ALLOC2]] - -// ----- - -// Test Case: structured control-flow loop with a nested if operation. -// The loop yields buffers that have been defined outside of the loop and the -// backedges only use the iteration arguments (or one of its aliases). -// Therefore, we do not have to (and are not allowed to) free any buffers -// that are passed via the backedges. 
- -// CHECK-LABEL: func @loop_nested_if_no_alloc -func.func @loop_nested_if_no_alloc( - %lb: index, - %ub: index, - %step: index, - %buf: memref<2xf32>, - %res: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - %1 = scf.for %i = %lb to %ub step %step - iter_args(%iterBuf = %buf) -> memref<2xf32> { - %2 = arith.cmpi eq, %i, %ub : index - %3 = scf.if %2 -> (memref<2xf32>) { - scf.yield %0 : memref<2xf32> - } else { - scf.yield %iterBuf : memref<2xf32> - } - scf.yield %3 : memref<2xf32> - } - test.copy(%1, %res) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: %[[ALLOC1:.*]] = scf.for {{.*}} iter_args(%[[IALLOC:.*]] = -// CHECK: %[[ALLOC2:.*]] = scf.if -// CHECK: scf.yield %[[ALLOC0]] -// CHECK: scf.yield %[[IALLOC]] -// CHECK: scf.yield %[[ALLOC2]] -// CHECK: test.copy(%[[ALLOC1]], %arg4) -// CHECK: memref.dealloc %[[ALLOC0]] - -// ----- - -// Test Case: structured control-flow loop with a nested if operation using -// a deeply nested buffer allocation. -// Since the innermost allocation happens in a divergent branch, we have to -// introduce additional copies for the nested if operation. Since the loop's -// yield operation "returns" %3, it will return a newly allocated buffer. -// Therefore, we have to free the iteration argument %iterBuf before -// "returning" %3. - -// CHECK-LABEL: func @loop_nested_if_alloc -func.func @loop_nested_if_alloc( - %lb: index, - %ub: index, - %step: index, - %buf: memref<2xf32>) -> memref<2xf32> { - %0 = memref.alloc() : memref<2xf32> - %1 = scf.for %i = %lb to %ub step %step - iter_args(%iterBuf = %buf) -> memref<2xf32> { - %2 = arith.cmpi eq, %i, %ub : index - %3 = scf.if %2 -> (memref<2xf32>) { - %4 = memref.alloc() : memref<2xf32> - scf.yield %4 : memref<2xf32> - } else { - scf.yield %0 : memref<2xf32> - } - scf.yield %3 : memref<2xf32> - } - return %1 : memref<2xf32> -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: %[[ALLOC1:.*]] = bufferization.clone %arg3 -// CHECK-NEXT: %[[ALLOC2:.*]] = scf.for {{.*}} iter_args -// CHECK-SAME: (%[[IALLOC:.*]] = %[[ALLOC1]] -// CHECK: memref.dealloc %[[IALLOC]] -// CHECK: %[[ALLOC3:.*]] = scf.if - -// CHECK: %[[ALLOC4:.*]] = memref.alloc() -// CHECK-NEXT: %[[ALLOC5:.*]] = bufferization.clone %[[ALLOC4]] -// CHECK-NEXT: memref.dealloc %[[ALLOC4]] -// CHECK-NEXT: scf.yield %[[ALLOC5]] - -// CHECK: %[[ALLOC6:.*]] = bufferization.clone %[[ALLOC0]] -// CHECK-NEXT: scf.yield %[[ALLOC6]] - -// CHECK: %[[ALLOC7:.*]] = bufferization.clone %[[ALLOC3]] -// CHECK-NEXT: memref.dealloc %[[ALLOC3]] -// CHECK-NEXT: scf.yield %[[ALLOC7]] - -// CHECK: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: return %[[ALLOC2]] - -// ----- - -// Test Case: several nested structured control-flow loops with a deeply nested -// buffer allocation inside an if operation. -// Same behavior is an loop_nested_if_alloc: we have to insert deallocations -// before each yield in all loops recursively. 
- -// CHECK-LABEL: func @loop_nested_alloc -func.func @loop_nested_alloc( - %lb: index, - %ub: index, - %step: index, - %buf: memref<2xf32>, - %res: memref<2xf32>) { - %0 = memref.alloc() : memref<2xf32> - %1 = scf.for %i = %lb to %ub step %step - iter_args(%iterBuf = %buf) -> memref<2xf32> { - %2 = scf.for %i2 = %lb to %ub step %step - iter_args(%iterBuf2 = %iterBuf) -> memref<2xf32> { - %3 = scf.for %i3 = %lb to %ub step %step - iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { - %4 = memref.alloc() : memref<2xf32> - %5 = arith.cmpi eq, %i, %ub : index - %6 = scf.if %5 -> (memref<2xf32>) { - %7 = memref.alloc() : memref<2xf32> - scf.yield %7 : memref<2xf32> - } else { - scf.yield %iterBuf3 : memref<2xf32> - } - scf.yield %6 : memref<2xf32> - } - scf.yield %3 : memref<2xf32> - } - scf.yield %2 : memref<2xf32> - } - test.copy(%1, %res) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: %[[ALLOC1:.*]] = bufferization.clone %arg3 -// CHECK-NEXT: %[[VAL_7:.*]] = scf.for {{.*}} iter_args -// CHECK-SAME: (%[[IALLOC0:.*]] = %[[ALLOC1]]) -// CHECK-NEXT: %[[ALLOC2:.*]] = bufferization.clone %[[IALLOC0]] -// CHECK-NEXT: memref.dealloc %[[IALLOC0]] -// CHECK-NEXT: %[[ALLOC3:.*]] = scf.for {{.*}} iter_args -// CHECK-SAME: (%[[IALLOC1:.*]] = %[[ALLOC2]]) -// CHECK-NEXT: %[[ALLOC5:.*]] = bufferization.clone %[[IALLOC1]] -// CHECK-NEXT: memref.dealloc %[[IALLOC1]] - -// CHECK: %[[ALLOC6:.*]] = scf.for {{.*}} iter_args -// CHECK-SAME: (%[[IALLOC2:.*]] = %[[ALLOC5]]) -// CHECK: %[[ALLOC8:.*]] = memref.alloc() -// CHECK-NEXT: memref.dealloc %[[ALLOC8]] -// CHECK: %[[ALLOC9:.*]] = scf.if - -// CHECK: %[[ALLOC11:.*]] = memref.alloc() -// CHECK-NEXT: %[[ALLOC12:.*]] = bufferization.clone %[[ALLOC11]] -// CHECK-NEXT: memref.dealloc %[[ALLOC11]] -// CHECK-NEXT: scf.yield %[[ALLOC12]] - -// CHECK: %[[ALLOC13:.*]] = bufferization.clone %[[IALLOC2]] -// CHECK-NEXT: scf.yield %[[ALLOC13]] - -// CHECK: memref.dealloc %[[IALLOC2]] -// CHECK-NEXT: %[[ALLOC10:.*]] = bufferization.clone %[[ALLOC9]] -// CHECK-NEXT: memref.dealloc %[[ALLOC9]] -// CHECK-NEXT: scf.yield %[[ALLOC10]] - -// CHECK: %[[ALLOC7:.*]] = bufferization.clone %[[ALLOC6]] -// CHECK-NEXT: memref.dealloc %[[ALLOC6]] -// CHECK-NEXT: scf.yield %[[ALLOC7]] - -// CHECK: %[[ALLOC4:.*]] = bufferization.clone %[[ALLOC3]] -// CHECK-NEXT: memref.dealloc %[[ALLOC3]] -// CHECK-NEXT: scf.yield %[[ALLOC4]] - -// CHECK: test.copy(%[[VAL_7]], %arg4) -// CHECK-NEXT: memref.dealloc %[[VAL_7]] - -// ----- - -// CHECK-LABEL: func @affine_loop -func.func @affine_loop() { - %buffer = memref.alloc() : memref<1024xf32> - %sum_init_0 = arith.constant 0.0 : f32 - %res = affine.for %i = 0 to 10 step 2 iter_args(%sum_iter = %sum_init_0) -> f32 { - %t = affine.load %buffer[%i] : memref<1024xf32> - %sum_next = arith.addf %sum_iter, %t : f32 - affine.yield %sum_next : f32 - } - // CHECK: %[[M:.*]] = memref.alloc - // CHECK: affine.for - // CHECK: } - // CHECK-NEXT: memref.dealloc %[[M]] - return -} - -// ----- - -// Test Case: explicit control-flow loop with a dynamically allocated buffer. -// The BufferDeallocation transformation should fail on this explicit -// control-flow loop since they are not supported. 
- -// expected-error@+1 {{Only structured control-flow loops are supported}} -func.func @loop_dynalloc( - %arg0 : i32, - %arg1 : i32, - %arg2: memref, - %arg3: memref) { - %const0 = arith.constant 0 : i32 - cf.br ^loopHeader(%const0, %arg2 : i32, memref) - -^loopHeader(%i : i32, %buff : memref): - %lessThan = arith.cmpi slt, %i, %arg1 : i32 - cf.cond_br %lessThan, - ^loopBody(%i, %buff : i32, memref), - ^exit(%buff : memref) - -^loopBody(%val : i32, %buff2: memref): - %const1 = arith.constant 1 : i32 - %inc = arith.addi %val, %const1 : i32 - %size = arith.index_cast %inc : i32 to index - %alloc1 = memref.alloc(%size) : memref - cf.br ^loopHeader(%inc, %alloc1 : i32, memref) - -^exit(%buff3 : memref): - test.copy(%buff3, %arg3) : (memref, memref) - return -} - -// ----- - -// Test Case: explicit control-flow loop with a dynamically allocated buffer. -// The BufferDeallocation transformation should fail on this explicit -// control-flow loop since they are not supported. - -// expected-error@+1 {{Only structured control-flow loops are supported}} -func.func @do_loop_alloc( - %arg0 : i32, - %arg1 : i32, - %arg2: memref<2xf32>, - %arg3: memref<2xf32>) { - %const0 = arith.constant 0 : i32 - cf.br ^loopBody(%const0, %arg2 : i32, memref<2xf32>) - -^loopBody(%val : i32, %buff2: memref<2xf32>): - %const1 = arith.constant 1 : i32 - %inc = arith.addi %val, %const1 : i32 - %alloc1 = memref.alloc() : memref<2xf32> - cf.br ^loopHeader(%inc, %alloc1 : i32, memref<2xf32>) - -^loopHeader(%i : i32, %buff : memref<2xf32>): - %lessThan = arith.cmpi slt, %i, %arg1 : i32 - cf.cond_br %lessThan, - ^loopBody(%i, %buff : i32, memref<2xf32>), - ^exit(%buff : memref<2xf32>) - -^exit(%buff3 : memref<2xf32>): - test.copy(%buff3, %arg3) : (memref<2xf32>, memref<2xf32>) - return -} - -// ----- - -// CHECK-LABEL: func @assumingOp( -func.func @assumingOp( - %arg0: !shape.witness, - %arg2: memref<2xf32>, - %arg3: memref<2xf32>) { - // Confirm the alloc will be dealloc'ed in the block. - %1 = shape.assuming %arg0 -> memref<2xf32> { - %0 = memref.alloc() : memref<2xf32> - shape.assuming_yield %arg2 : memref<2xf32> - } - // Confirm the alloc will be returned and dealloc'ed after its use. - %3 = shape.assuming %arg0 -> memref<2xf32> { - %2 = memref.alloc() : memref<2xf32> - shape.assuming_yield %2 : memref<2xf32> - } - test.copy(%3, %arg3) : (memref<2xf32>, memref<2xf32>) - return -} - -// CHECK-SAME: %[[ARG0:.*]]: !shape.witness, -// CHECK-SAME: %[[ARG1:.*]]: {{.*}}, -// CHECK-SAME: %[[ARG2:.*]]: {{.*}} -// CHECK: %[[UNUSED_RESULT:.*]] = shape.assuming %[[ARG0]] -// CHECK-NEXT: %[[ALLOC0:.*]] = memref.alloc() -// CHECK-NEXT: memref.dealloc %[[ALLOC0]] -// CHECK-NEXT: shape.assuming_yield %[[ARG1]] -// CHECK: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[ARG0]] -// CHECK-NEXT: %[[TMP_ALLOC:.*]] = memref.alloc() -// CHECK-NEXT: %[[RETURNING_ALLOC:.*]] = bufferization.clone %[[TMP_ALLOC]] -// CHECK-NEXT: memref.dealloc %[[TMP_ALLOC]] -// CHECK-NEXT: shape.assuming_yield %[[RETURNING_ALLOC]] -// CHECK: test.copy(%[[ASSUMING_RESULT:.*]], %[[ARG2]]) -// CHECK-NEXT: memref.dealloc %[[ASSUMING_RESULT]] - -// ----- - -// Test Case: The op "test.bar" does not implement the RegionBranchOpInterface. -// This is not allowed in buffer deallocation. 
- -func.func @noRegionBranchOpInterface() { -// expected-error@+1 {{All operations with attached regions need to implement the RegionBranchOpInterface.}} - %0 = "test.bar"() ({ -// expected-error@+1 {{All operations with attached regions need to implement the RegionBranchOpInterface.}} - %1 = "test.bar"() ({ - "test.yield"() : () -> () - }) : () -> (i32) - "test.yield"() : () -> () - }) : () -> (i32) - "test.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @dealloc_existing_clones -// CHECK: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref) -// CHECK: %[[RES0:.*]] = bufferization.clone %[[ARG0]] -// CHECK: %[[RES1:.*]] = bufferization.clone %[[ARG1]] -// CHECK-NOT: memref.dealloc %[[RES0]] -// CHECK: memref.dealloc %[[RES1]] -// CHECK: return %[[RES0]] -func.func @dealloc_existing_clones(%arg0: memref, %arg1: memref) -> memref { - %0 = bufferization.clone %arg0 : memref to memref - %1 = bufferization.clone %arg1 : memref to memref - return %0 : memref -} - -// ----- - -// CHECK-LABEL: func @while_two_arg -func.func @while_two_arg(%arg0: index) { - %a = memref.alloc(%arg0) : memref -// CHECK: %[[WHILE:.*]]:2 = scf.while (%[[ARG1:.*]] = %[[ALLOC:.*]], %[[ARG2:.*]] = %[[CLONE:.*]]) - scf.while (%arg1 = %a, %arg2 = %a) : (memref, memref) -> (memref, memref) { -// CHECK-NEXT: make_condition - %0 = "test.make_condition"() : () -> i1 -// CHECK-NEXT: bufferization.clone %[[ARG2]] -// CHECK-NEXT: memref.dealloc %[[ARG2]] - scf.condition(%0) %arg1, %arg2 : memref, memref - } do { - ^bb0(%arg1: memref, %arg2: memref): -// CHECK: %[[ALLOC2:.*]] = memref.alloc - %b = memref.alloc(%arg0) : memref -// CHECK: memref.dealloc %[[ARG2]] -// CHECK: %[[CLONE2:.*]] = bufferization.clone %[[ALLOC2]] -// CHECK: memref.dealloc %[[ALLOC2]] - scf.yield %arg1, %b : memref, memref - } -// CHECK: } -// CHECK-NEXT: memref.dealloc %[[WHILE]]#1 -// CHECK-NEXT: memref.dealloc %[[ALLOC]] -// CHECK-NEXT: return - return -} - -// ----- - -func.func @while_three_arg(%arg0: index) { -// CHECK: %[[ALLOC:.*]] = memref.alloc - %a = memref.alloc(%arg0) : memref -// CHECK-NEXT: %[[CLONE1:.*]] = bufferization.clone %[[ALLOC]] -// CHECK-NEXT: %[[CLONE2:.*]] = bufferization.clone %[[ALLOC]] -// CHECK-NEXT: %[[CLONE3:.*]] = bufferization.clone %[[ALLOC]] -// CHECK-NEXT: memref.dealloc %[[ALLOC]] -// CHECK-NEXT: %[[WHILE:.*]]:3 = scf.while -// FIXME: This is non-deterministic -// CHECK-SAME-DAG: [[CLONE1]] -// CHECK-SAME-DAG: [[CLONE2]] -// CHECK-SAME-DAG: [[CLONE3]] - scf.while (%arg1 = %a, %arg2 = %a, %arg3 = %a) : (memref, memref, memref) -> (memref, memref, memref) { - %0 = "test.make_condition"() : () -> i1 - scf.condition(%0) %arg1, %arg2, %arg3 : memref, memref, memref - } do { - ^bb0(%arg1: memref, %arg2: memref, %arg3: memref): - %b = memref.alloc(%arg0) : memref - %q = memref.alloc(%arg0) : memref - scf.yield %q, %b, %arg2: memref, memref, memref - } -// CHECK-DAG: memref.dealloc %[[WHILE]]#0 -// CHECK-DAG: memref.dealloc %[[WHILE]]#1 -// CHECK-DAG: memref.dealloc %[[WHILE]]#2 -// CHECK-NEXT: return - return -} - -// ----- - -func.func @select_aliases(%arg0: index, %arg1: memref, %arg2: i1) { - // CHECK: memref.alloc - // CHECK: memref.alloc - // CHECK: arith.select - // CHECK: test.copy - // CHECK: memref.dealloc - // CHECK: memref.dealloc - %0 = memref.alloc(%arg0) : memref - %1 = memref.alloc(%arg0) : memref - %2 = arith.select %arg2, %0, %1 : memref - test.copy(%2, %arg1) : (memref, memref) - return -} - -// ----- - -func.func @f(%arg0: memref) -> memref { - return %arg0 : memref -} - -// CHECK-LABEL: func 
@function_call -// CHECK: memref.alloc -// CHECK: memref.alloc -// CHECK: call -// CHECK: test.copy -// CHECK: memref.dealloc -// CHECK: memref.dealloc -func.func @function_call() { - %alloc = memref.alloc() : memref - %alloc2 = memref.alloc() : memref - %ret = call @f(%alloc) : (memref) -> memref - test.copy(%ret, %alloc2) : (memref, memref) - return -} - -// ----- - -// Memref allocated in `then` region and passed back to the parent if op. -#set = affine_set<() : (0 >= 0)> -// CHECK-LABEL: func @test_affine_if_1 -// CHECK-SAME: %[[ARG0:.*]]: memref<10xf32>) -> memref<10xf32> { -func.func @test_affine_if_1(%arg0: memref<10xf32>) -> memref<10xf32> { - %0 = affine.if #set() -> memref<10xf32> { - %alloc = memref.alloc() : memref<10xf32> - affine.yield %alloc : memref<10xf32> - } else { - affine.yield %arg0 : memref<10xf32> - } - return %0 : memref<10xf32> -} -// CHECK-NEXT: %[[IF:.*]] = affine.if -// CHECK-NEXT: %[[MEMREF:.*]] = memref.alloc() : memref<10xf32> -// CHECK-NEXT: %[[CLONED:.*]] = bufferization.clone %[[MEMREF]] : memref<10xf32> to memref<10xf32> -// CHECK-NEXT: memref.dealloc %[[MEMREF]] : memref<10xf32> -// CHECK-NEXT: affine.yield %[[CLONED]] : memref<10xf32> -// CHECK-NEXT: } else { -// CHECK-NEXT: %[[ARG0_CLONE:.*]] = bufferization.clone %[[ARG0]] : memref<10xf32> to memref<10xf32> -// CHECK-NEXT: affine.yield %[[ARG0_CLONE]] : memref<10xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[IF]] : memref<10xf32> - -// ----- - -// Memref allocated before parent IfOp and used in `then` region. -// Expected result: deallocation should happen after affine.if op. -#set = affine_set<() : (0 >= 0)> -// CHECK-LABEL: func @test_affine_if_2() -> memref<10xf32> { -func.func @test_affine_if_2() -> memref<10xf32> { - %alloc0 = memref.alloc() : memref<10xf32> - %0 = affine.if #set() -> memref<10xf32> { - affine.yield %alloc0 : memref<10xf32> - } else { - %alloc = memref.alloc() : memref<10xf32> - affine.yield %alloc : memref<10xf32> - } - return %0 : memref<10xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() : memref<10xf32> -// CHECK-NEXT: %[[IF_RES:.*]] = affine.if {{.*}} -> memref<10xf32> { -// CHECK-NEXT: %[[ALLOC_CLONE:.*]] = bufferization.clone %[[ALLOC]] : memref<10xf32> to memref<10xf32> -// CHECK-NEXT: affine.yield %[[ALLOC_CLONE]] : memref<10xf32> -// CHECK-NEXT: } else { -// CHECK-NEXT: %[[ALLOC2:.*]] = memref.alloc() : memref<10xf32> -// CHECK-NEXT: %[[ALLOC2_CLONE:.*]] = bufferization.clone %[[ALLOC2]] : memref<10xf32> to memref<10xf32> -// CHECK-NEXT: memref.dealloc %[[ALLOC2]] : memref<10xf32> -// CHECK-NEXT: affine.yield %[[ALLOC2_CLONE]] : memref<10xf32> -// CHECK-NEXT: } -// CHECK-NEXT: memref.dealloc %[[ALLOC]] : memref<10xf32> -// CHECK-NEXT: return %[[IF_RES]] : memref<10xf32> - -// ----- - -// Memref allocated before parent IfOp and used in `else` region. -// Expected result: deallocation should happen after affine.if op. 
-#set = affine_set<() : (0 >= 0)> -// CHECK-LABEL: func @test_affine_if_3() -> memref<10xf32> { -func.func @test_affine_if_3() -> memref<10xf32> { - %alloc0 = memref.alloc() : memref<10xf32> - %0 = affine.if #set() -> memref<10xf32> { - %alloc = memref.alloc() : memref<10xf32> - affine.yield %alloc : memref<10xf32> - } else { - affine.yield %alloc0 : memref<10xf32> - } - return %0 : memref<10xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() : memref<10xf32> -// CHECK-NEXT: %[[IFRES:.*]] = affine.if {{.*}} -> memref<10xf32> { -// CHECK-NEXT: memref.alloc -// CHECK-NEXT: bufferization.clone -// CHECK-NEXT: memref.dealloc -// CHECK-NEXT: affine.yield -// CHECK-NEXT: } else { -// CHECK-NEXT: bufferization.clone -// CHECK-NEXT: affine.yield -// CHECK-NEXT: } -// CHECK-NEXT: memref.dealloc %[[ALLOC]] : memref<10xf32> -// CHECK-NEXT: return %[[IFRES]] : memref<10xf32> - -// ----- - -// Memref allocated before parent IfOp and not used later. -// Expected result: deallocation should happen before affine.if op. -#set = affine_set<() : (0 >= 0)> -// CHECK-LABEL: func @test_affine_if_4({{.*}}: memref<10xf32>) -> memref<10xf32> { -func.func @test_affine_if_4(%arg0 : memref<10xf32>) -> memref<10xf32> { - %alloc0 = memref.alloc() : memref<10xf32> - %0 = affine.if #set() -> memref<10xf32> { - affine.yield %arg0 : memref<10xf32> - } else { - %alloc = memref.alloc() : memref<10xf32> - affine.yield %alloc : memref<10xf32> - } - return %0 : memref<10xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = memref.alloc() : memref<10xf32> -// CHECK-NEXT: memref.dealloc %[[ALLOC]] : memref<10xf32> -// CHECK-NEXT: affine.if - -// ----- - -// Ensure we free the realloc, not the alloc. - -// CHECK-LABEL: func @auto_dealloc() -func.func @auto_dealloc() { - %c10 = arith.constant 10 : index - %c100 = arith.constant 100 : index - %alloc = memref.alloc(%c10) : memref - %realloc = memref.realloc %alloc(%c100) : memref to memref - return -} -// CHECK-DAG: %[[C10:.*]] = arith.constant 10 : index -// CHECK-DAG: %[[C100:.*]] = arith.constant 100 : index -// CHECK-NEXT: %[[A:.*]] = memref.alloc(%[[C10]]) : memref -// CHECK-NEXT: %[[R:.*]] = memref.realloc %alloc(%[[C100]]) : memref to memref -// CHECK-NEXT: memref.dealloc %[[R]] : memref -// CHECK-NEXT: return - - diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index cb8064411bbae..b2b29b2b2fee2 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -46,6 +46,34 @@ func.func @dynamic_elem_pack(%arg0: tensor, %dest: tensor) // ----- +#map0 = affine_map<(d0, d1) -> (d0, d1)> +func.func @dynamic_elem_pack_padding_value(%arg0: tensor, %dest: tensor) -> tensor +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 3.000000e+00 : f32 + %0 = tensor.dim %arg0, %c0 : tensor + %1 = tensor.dim %arg0, %c1 : tensor + %2 = tensor.empty(%0, %1) : tensor + %3 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel", "parallel"]} + ins(%arg0 : tensor) + outs(%2 : tensor) { + ^bb0(%arg3: f32, %arg4: f32): + %4 = arith.addf %arg3, %arg3 : f32 + linalg.yield %4 : f32 + } -> tensor + %4 = tensor.pack %3 padding_value(%cst : f32) + inner_dims_pos = [0, 1] + inner_tiles = [8, 2] + into %dest : tensor -> tensor + return %4 : tensor +} +// CHECK-LABEL: func.func @dynamic_elem_pack_padding_value +// CHECK: %[[GENERIC:.+]] = linalg.generic +// CHECK: tensor.pack %[[GENERIC]] + +// ----- + #map0 = 
affine_map<(d0, d1) -> (d0, d1)> func.func @elem_pack_transpose_inner_dims(%arg0: tensor<128x256xi32>, %dest: tensor<4x16x16x32xi32>) -> tensor<4x16x16x32xi32>{ %init = tensor.empty() : tensor<128x256xi32> diff --git a/mlir/test/Dialect/Mesh/canonicalization.mlir b/mlir/test/Dialect/Mesh/canonicalization.mlir index f0112d689805d..aff07bbf8a214 100644 --- a/mlir/test/Dialect/Mesh/canonicalization.mlir +++ b/mlir/test/Dialect/Mesh/canonicalization.mlir @@ -207,4 +207,42 @@ func.func @test_shard_offs() -> !mesh.sharding { // CHECK mesh.sharding @mesh4x4 split_axes = [[0], [1]] sharded_dims_offsets = [0, 1, 2, 3, 4, 0, 2, 3, 4, 22] : !mesh.sharding %sharding = mesh.sharding @mesh4x4 split_axes = [[0], [1]] sharded_dims_offsets = [0, 1, %c2_i64, 3, 4, 0, %c2_i64, 3, 4, 22] : !mesh.sharding return %sharding : !mesh.sharding -} \ No newline at end of file +} + +// CHECK-LABEL: func @test_duplicate_shardops +func.func @test_duplicate_shardops() -> (tensor<1024x1024xf32>, tensor<1024x1024xf32>) attributes {llvm.emit_c_interface} { + // CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + // CHECK-NEXT: [[vsharding:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0, 1]] : !mesh.sharding + %sharding_1 = mesh.sharding @mesh4x4 split_axes = [[0, 1]] : !mesh.sharding + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %sharding_2 = mesh.sharding @mesh4x4 split_axes = [[0, 1]] : !mesh.sharding + %sharding_annotated_2 = mesh.shard %cst_2 to %sharding_2 : tensor<1024x1024xf32> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %sharding_3 = mesh.sharding @mesh4x4 split_axes = [[0, 1]] : !mesh.sharding + %sharding_annotated_3 = mesh.shard %cst_3 to %sharding_3 : tensor<1024x1024xf32> + // CHECK-NEXT: [[vsharding_annotated:%.*]] = mesh.shard [[vcst]] to [[vsharding]] : tensor<1024x1024xf32> + %sharding_annotated_1 = mesh.shard %cst_1 to %sharding_1 : tensor<1024x1024xf32> + // CHECK-NEXT: return [[vsharding_annotated]], [[vsharding_annotated]] : tensor<1024x1024xf32>, tensor<1024x1024xf32> + return %sharding_annotated_1, %sharding_annotated_2 : tensor<1024x1024xf32>, tensor<1024x1024xf32> +} + +// CHECK-LABEL: func @test_duplicate_shardops_diff +func.func @test_duplicate_shardops_diff() -> (tensor<1024x1024xf32>, tensor<1024x1024xf32>) attributes {llvm.emit_c_interface} { + // CHECK-NEXT: [[vcst:%.*]] = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + // CHECK-NEXT: [[vsharding:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0]] : !mesh.sharding + %sharding_1 = mesh.sharding @mesh4x4 split_axes = [[0]] : !mesh.sharding + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + // CHECK-NEXT: [[vsharding_0:%.*]] = mesh.sharding @mesh4x4 split_axes = {{\[\[}}0, 1]] : !mesh.sharding + %sharding_2 = mesh.sharding @mesh4x4 split_axes = [[0, 1]] : !mesh.sharding + // CHECK-NEXT: [[vsharding_annotated:%.*]] = mesh.shard [[vcst]] to [[vsharding_0]] : tensor<1024x1024xf32> + %sharding_annotated_2 = mesh.shard %cst_2 to %sharding_2 : tensor<1024x1024xf32> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1024x1024xf32> + %sharding_3 = mesh.sharding @mesh4x4 split_axes = [[0]] : !mesh.sharding + %sharding_annotated_3 = mesh.shard %cst_3 to %sharding_3 : tensor<1024x1024xf32> + // CHECK-NEXT: [[vsharding_annotated_1:%.*]] = mesh.shard [[vsharding_annotated]] to 
[[vsharding]] : tensor<1024x1024xf32> + %sharding_annotated_1 = mesh.shard %cst_1 to %sharding_1 : tensor<1024x1024xf32> + // CHECK-NEXT: return [[vsharding_annotated_1]], [[vsharding_annotated]] : tensor<1024x1024xf32>, tensor<1024x1024xf32> + return %sharding_annotated_1, %sharding_annotated_2 : tensor<1024x1024xf32>, tensor<1024x1024xf32> +} diff --git a/mlir/test/Dialect/Mesh/ops.mlir b/mlir/test/Dialect/Mesh/ops.mlir index 978de4939ee77..43a75bf3d8040 100644 --- a/mlir/test/Dialect/Mesh/ops.mlir +++ b/mlir/test/Dialect/Mesh/ops.mlir @@ -164,6 +164,14 @@ func.func @mesh_shard_shape() { return } +// CHECK-LABEL: func @mesh_get_sharding +// CHECK-SAME: %[[ARG:.*]]: tensor<4x8xf32> +func.func @mesh_get_sharding(%arg0 : tensor<4x8xf32>) -> !mesh.sharding { + // CHECK-NEXT: mesh.get_sharding %[[ARG]] : tensor<4x8xf32> -> !mesh.sharding + %0 = mesh.get_sharding %arg0 : tensor<4x8xf32> -> !mesh.sharding + return %0 : !mesh.sharding +} + // CHECK-LABEL: func @mesh_shape func.func @mesh_shape() -> (index, index) { // CHECK: %[[RES:.*]]:2 = mesh.mesh_shape @mesh0 axes = [0, 1] : index, index diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir index c1b96fda0f4a7..59f7162e21013 100644 --- a/mlir/test/Dialect/Mesh/spmdization.mlir +++ b/mlir/test/Dialect/Mesh/spmdization.mlir @@ -4,6 +4,20 @@ mesh.mesh @mesh_1d(shape = 2) +// CHECK-LABEL: func @return_sharding +func.func @return_sharding( + // CHECK-SAME: [[ARG:%.*]]: tensor<1xf32> + %arg0: tensor<2xf32> +// CHECK-SAME: ) -> (tensor<1xf32>, !mesh.sharding) { +) -> (tensor<2xf32>, !mesh.sharding) { + %ssharding_annotated = mesh.sharding @mesh_1d split_axes = [[0]] : !mesh.sharding + %sharding_annotated = mesh.shard %arg0 to %ssharding_annotated : tensor<2xf32> + // CHECK-NEXT: [[vsharding:%.*]] = mesh.sharding @mesh_1d split_axes = {{\[\[}}0]] : !mesh.sharding + %r = mesh.get_sharding %sharding_annotated : tensor<2xf32> -> !mesh.sharding + // CHECK-NEXT: return [[ARG]], [[vsharding]] : tensor<1xf32>, !mesh.sharding + return %sharding_annotated, %r : tensor<2xf32>, !mesh.sharding +} + // CHECK-LABEL: func @full_replication func.func @full_replication( // CHECK-SAME: %[[ARG:.*]]: tensor<2xi8> diff --git a/mlir/test/Dialect/SCF/loop-unroll.mlir b/mlir/test/Dialect/SCF/loop-unroll.mlir index 0368505a1b70d..4c72d9e99d049 100644 --- a/mlir/test/Dialect/SCF/loop-unroll.mlir +++ b/mlir/test/Dialect/SCF/loop-unroll.mlir @@ -3,9 +3,9 @@ // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=0' | FileCheck %s --check-prefix UNROLL-OUTER-BY-2 // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=1' | FileCheck %s --check-prefix UNROLL-INNER-BY-2 // RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 annotate=true' | FileCheck %s --check-prefix UNROLL-BY-2-ANNOTATE -// RUN: mlir-opt %s --affine-loop-unroll='unroll-factor=6 unroll-up-to-factor=true' | FileCheck %s --check-prefix UNROLL-UP-TO -// RUN: mlir-opt %s --affine-loop-unroll='unroll-factor=5 cleanup-unroll=true' | FileCheck %s --check-prefix CLEANUP-UNROLL-BY-5 -// RUN: mlir-opt %s --affine-loop-unroll --split-input-file | FileCheck %s +// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=6 unroll-up-to-factor=true}))" | FileCheck %s --check-prefix UNROLL-UP-TO +// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(affine-loop-unroll{unroll-factor=5 cleanup-unroll=true}))" | FileCheck %s --check-prefix CLEANUP-UNROLL-BY-5 +// RUN: mlir-opt %s 
-pass-pipeline="builtin.module(func.func(affine-loop-unroll))" --split-input-file | FileCheck %s func.func @dynamic_loop_unroll(%arg0 : index, %arg1 : index, %arg2 : index, %arg3: memref) { diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index 582fd77cd7bc8..24d572244a9b0 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -52,25 +52,16 @@ func.func @cast_nofold(%arg0: tensor) -> tensor { // CHECK-LABEL: @clamp_i32_not_noop func.func @clamp_i32_not_noop(%arg0: tensor<4xi32>) -> tensor<4xi32> { // CHECK: tosa.clamp - %0 = tosa.clamp %arg0 {min_int = 1 : i64, max_int = 4 : i64, min_fp = 1.0 : f32, max_fp = 4.0 : f32} : (tensor<4xi32>) -> tensor<4xi32> + %0 = tosa.clamp %arg0 {min_val = 1 : i32, max_val = 4 : i32} : (tensor<4xi32>) -> tensor<4xi32> return %0 : tensor<4xi32> } // ----- -// CHECK-LABEL: @clamp_f16_not_noop -func.func @clamp_f16_not_noop(%arg0: tensor<4xf16>) -> tensor<4xf16> { - // CHECK: tosa.clamp - %0 = tosa.clamp %arg0 {min_int = -128 : i64, max_int = 127 : i64, min_fp = -3.40282347E+38 : f32, max_fp = 3.40282347E+38 : f32} : (tensor<4xf16>) -> tensor<4xf16> - return %0 : tensor<4xf16> -} - -// ----- - // CHECK-LABEL: @clamp_f32_not_noop func.func @clamp_f32_not_noop(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: tosa.clamp - %0 = tosa.clamp %arg0 {min_int = -128 : i64, max_int = 127 : i64, min_fp = -3.40282347E+38 : f32, max_fp = 3.40282347E+38 : f32} : (tensor<4xf32>) -> tensor<4xf32> + %0 = tosa.clamp %arg0 {min_val = -3.40282347E+38 : f32, max_val = 3.40282347E+38 : f32} : (tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -80,8 +71,8 @@ func.func @clamp_f32_not_noop(%arg0: tensor<4xf32>) -> tensor<4xf32> { func.func @clamp_f16_is_noop(%arg0: tensor<4xf16>) -> tensor<4xf16> { // CHECK: return %arg0 // CHECK-NOT: "tosa.clamp" - // 0xFF800000 and 0x7F800000 are respectively negative and positive F32 infinity. - %0 = tosa.clamp %arg0 {min_int = -128 : i64, max_int = 127 : i64, min_fp = 0xFF800000 : f32, max_fp = 0x7F800000 : f32} : (tensor<4xf16>) -> tensor<4xf16> + // 0x7C00 and 0xFC00 are respectively positive and negative F32 infinity. + %0 = tosa.clamp %arg0 {max_val = 0x7C00 : f16, min_val = 0xFC00 : f16} : (tensor<4xf16>) -> tensor<4xf16> return %0 : tensor<4xf16> } @@ -92,7 +83,7 @@ func.func @clamp_f32_is_noop(%arg0: tensor<4xf32>) -> tensor<4xf32> { // CHECK: return %arg0 // CHECK-NOT: "tosa.clamp" // 0xFF800000 and 0x7F800000 are respectively negative and positive F32 infinity. 
- %0 = tosa.clamp %arg0 {min_int = -128 : i64, max_int = 127 : i64, min_fp = 0xFF800000 : f32, max_fp = 0x7F800000 : f32} : (tensor<4xf32>) -> tensor<4xf32> + %0 = tosa.clamp %arg0 {min_val = 0xFF800000 : f32, max_val = 0x7F800000 : f32} : (tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -102,7 +93,7 @@ func.func @clamp_f32_is_noop(%arg0: tensor<4xf32>) -> tensor<4xf32> { func.func @clamp_int8_is_noop(%arg0: tensor<4xi8>) -> tensor<4xi8> { // CHECK: return %arg0 // CHECK-NOT: tosa.clamp - %0 = tosa.clamp %arg0 {min_int = -128 : i64, max_int = 127 : i64, min_fp = -3.40282347E+38 : f32, max_fp = 3.40282347E+38 : f32} : (tensor<4xi8>) -> tensor<4xi8> + %0 = tosa.clamp %arg0 {min_val = -128 : i8, max_val = 127 : i8} : (tensor<4xi8>) -> tensor<4xi8> return %0 : tensor<4xi8> } @@ -112,7 +103,7 @@ func.func @clamp_int8_is_noop(%arg0: tensor<4xi8>) -> tensor<4xi8> { func.func @clamp_int16_is_noop(%arg0: tensor<4xi16>) -> tensor<4xi16> { // CHECK: return %arg0 // CHECK-NOT: tosa.clamp - %0 = tosa.clamp %arg0 {min_int = -32768 : i64, max_int = 32767 : i64, min_fp = -3.40282347E+38 : f32, max_fp = 3.40282347E+38 : f32} : (tensor<4xi16>) -> tensor<4xi16> + %0 = tosa.clamp %arg0 {min_val = -32768 : i16, max_val = 32767 : i16} : (tensor<4xi16>) -> tensor<4xi16> return %0 : tensor<4xi16> } @@ -122,7 +113,7 @@ func.func @clamp_int16_is_noop(%arg0: tensor<4xi16>) -> tensor<4xi16> { func.func @clamp_uint8_is_noop(%arg0: tensor<4xui8>) -> tensor<4xui8> { // CHECK: return %arg0 // CHECK-NOT: tosa.clamp - %0 = tosa.clamp %arg0 {min_int = 0 : i64, max_int = 255 : i64, min_fp = -3.40282347E+38 : f32, max_fp = 3.40282347E+38 : f32} : (tensor<4xui8>) -> tensor<4xui8> + %0 = tosa.clamp %arg0 {min_val = 0 : ui8, max_val = 255 : ui8} : (tensor<4xui8>) -> tensor<4xui8> return %0 : tensor<4xui8> } @@ -130,9 +121,9 @@ func.func @clamp_uint8_is_noop(%arg0: tensor<4xui8>) -> tensor<4xui8> { // CHECK-LABEL: @clamp_twice_is_single_clamp func.func @clamp_twice_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { - // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64} - %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64} : (tensor<4xi8>) -> tensor<4xi8> - %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK: tosa.clamp %arg0 {max_val = 2 : i8, min_val = -2 : i8} + %0 = tosa.clamp %arg0 {max_val = 4 : i8, min_val = -2 : i8} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_val = 2 : i8, min_val = -4 : i8} : (tensor<4xi8>) -> tensor<4xi8> return %1 : tensor<4xi8> } @@ -140,10 +131,10 @@ func.func @clamp_twice_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { // CHECK: @disjoint_clamp_twice_is_not_single_clamp(%[[INPUT:.*]]: tensor<4xi8>) func.func @disjoint_clamp_twice_is_not_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { - // CHECK: %[[CLAMP_1:.*]] = tosa.clamp %[[INPUT]] {max_fp = -5.000000e+00 : f32, max_int = -5 : i64, min_fp = -1.000000e+00 : f32, min_int = -10 : i64} : (tensor<4xi8>) -> tensor<4xi8> - // CHECK-NEXT: tosa.clamp %[[CLAMP_1]] {max_fp = 5.000000e+00 : f32, max_int = 5 : i64, min_fp = 1.000000e+00 : f32, min_int = 1 : i64} : (tensor<4xi8>) -> tensor<4xi8> - %0 = tosa.clamp %arg0 {max_fp = -5.0 : f32, max_int = -5 : i64, min_fp = -1.0 : f32, min_int = -10 : i64} : (tensor<4xi8>) -> tensor<4xi8> - %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 5 : i64, min_fp = 
1.0 : f32, min_int = 1 : i64} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK: %[[CLAMP_1:.*]] = tosa.clamp %[[INPUT]] {max_val = -5 : i8, min_val = -10 : i8} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK-NEXT: tosa.clamp %[[CLAMP_1]] {max_val = 5 : i8, min_val = 1 : i8} : (tensor<4xi8>) -> tensor<4xi8> + %0 = tosa.clamp %arg0 {max_val = -5 : i8, min_val = -10 : i8} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_val = 5 : i8, min_val = 1 : i8} : (tensor<4xi8>) -> tensor<4xi8> return %1 : tensor<4xi8> } @@ -151,9 +142,9 @@ func.func @disjoint_clamp_twice_is_not_single_clamp(%arg0: tensor<4xi8>) -> tens // CHECK-LABEL: @clamp_twice_with_nan_propagate_is_single_clamp func.func @clamp_twice_with_nan_propagate_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { - // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64} - %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> - %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK: tosa.clamp %arg0 {max_val = 2 : i8, min_val = -2 : i8} + %0 = tosa.clamp %arg0 {max_val = 4 : i8, min_val = -2 : i8, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_val = 2 : i8, min_val = -4 : i8, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> return %1 : tensor<4xi8> } @@ -161,9 +152,9 @@ func.func @clamp_twice_with_nan_propagate_is_single_clamp(%arg0: tensor<4xi8>) - // CHECK-LABEL: @clamp_twice_with_nan_ignore_is_single_clamp func.func @clamp_twice_with_nan_ignore_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { - // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} - %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> - %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK: tosa.clamp %arg0 {max_val = 2 : i8, min_val = -2 : i8, nan_mode = "IGNORE"} + %0 = tosa.clamp %arg0 {max_val = 4 : i8, min_val = -2 : i8, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_val = 2 : i8, min_val = -4 : i8, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> return %1 : tensor<4xi8> } @@ -171,9 +162,9 @@ func.func @clamp_twice_with_nan_ignore_is_single_clamp(%arg0: tensor<4xi8>) -> t // CHECK-LABEL: @clamp_twice_with_nan_ignore_propagate_is_single_clamp func.func @clamp_twice_with_nan_ignore_propagate_is_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { - // CHECK: tosa.clamp %arg0 {max_fp = 3.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} - %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> - %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK: tosa.clamp %arg0 {max_val = 2 : i8, min_val = -2 : i8, nan_mode = "IGNORE"} + %0 = tosa.clamp %arg0 {max_val = 4 : i8, min_val = -2 : i8, nan_mode = "IGNORE"} : (tensor<4xi8>) -> 
tensor<4xi8> + %1 = tosa.clamp %0 {max_val = 2 : i8, min_val = -4 : i8, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> return %1 : tensor<4xi8> } @@ -181,10 +172,10 @@ func.func @clamp_twice_with_nan_ignore_propagate_is_single_clamp(%arg0: tensor<4 // CHECK: @clamp_twice_with_nan_propagate_ignore_is_not_single_clamp(%[[INPUT:.*]]: tensor<4xi8>) func.func @clamp_twice_with_nan_propagate_ignore_is_not_single_clamp(%arg0: tensor<4xi8>) -> tensor<4xi8> { - // CHECK: %[[CLAMP_1:.*]] = tosa.clamp %[[INPUT]] {max_fp = 3.000000e+00 : f32, max_int = 4 : i64, min_fp = -5.000000e+00 : f32, min_int = -2 : i64} : (tensor<4xi8>) -> tensor<4xi8> - // CHECK-NEXT: tosa.clamp %[[CLAMP_1]] {max_fp = 5.000000e+00 : f32, max_int = 2 : i64, min_fp = -3.000000e+00 : f32, min_int = -4 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> - %0 = tosa.clamp %arg0 {max_fp = 3.0 : f32, max_int = 4 : i64, min_fp = -5.0 : f32, min_int = -2 : i64, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> - %1 = tosa.clamp %0 {max_fp = 5.0 : f32, max_int = 2 : i64, min_fp = -3.0 : f32, min_int = -4 : i64, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK: %[[CLAMP_1:.*]] = tosa.clamp %[[INPUT]] {max_val = 4 : i8, min_val = -2 : i8} : (tensor<4xi8>) -> tensor<4xi8> + // CHECK-NEXT: tosa.clamp %[[CLAMP_1]] {max_val = 2 : i8, min_val = -4 : i8, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> + %0 = tosa.clamp %arg0 {max_val = 4 : i8, min_val = -2 : i8, nan_mode = "PROPAGATE"} : (tensor<4xi8>) -> tensor<4xi8> + %1 = tosa.clamp %0 {max_val = 2 : i8, min_val = -4 : i8, nan_mode = "IGNORE"} : (tensor<4xi8>) -> tensor<4xi8> return %1 : tensor<4xi8> } @@ -331,8 +322,9 @@ func.func @pad_determine_val_quant(%arg0: tensor, %arg1 : tensor<2x2xi3 func.func @mul_one_float(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: return %arg0 // CHECK-NOT: tosa.mul + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> %ones = "tosa.const"() {value = dense<1.0> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %1 = tosa.mul %arg0, %ones : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + %1 = tosa.mul %arg0, %ones, %shift : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<1xi8>) -> tensor<2x3xf32> return %1 : tensor<2x3xf32> } @@ -343,7 +335,8 @@ func.func @mul_bcast_one_float(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: return %arg0 // CHECK-NOT: tosa.mul %ones = "tosa.const"() {value = dense<1.0> : tensor<1x1xf32>} : () -> tensor<1x1xf32> - %1 = tosa.mul %ones, %arg0 : (tensor<1x1xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %1 = tosa.mul %ones, %arg0, %shift : (tensor<1x1xf32>, tensor<2x3xf32>, tensor<1xi8>) -> tensor<2x3xf32> return %1 : tensor<2x3xf32> } @@ -379,11 +372,12 @@ func.func @mul_zero_broadcast(%arg0: tensor<2x3xf32>) -> (tensor<2x3xf32>, tenso // CHECK: %[[ZERO:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<2x3xf32>} // CHECK-NOT: tosa.mul %zeros = "tosa.const"() {value = dense<0.0> : tensor<1x1xf32>} : () -> tensor<1x1xf32> - %1 = tosa.mul %arg0, %zeros : (tensor<2x3xf32>, tensor<1x1xf32>) -> tensor<2x3xf32> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %1 = tosa.mul %arg0, %zeros, %shift : (tensor<2x3xf32>, tensor<1x1xf32>, tensor<1xi8>) -> tensor<2x3xf32> // CHECK-NOT: tosa.mul // CHECK: return %[[ZERO]], %[[ZERO]] - %2 = tosa.mul %zeros, %arg0 : (tensor<1x1xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + %2 = tosa.mul 
%zeros, %arg0, %shift : (tensor<1x1xf32>, tensor<2x3xf32>, tensor<1xi8>) -> tensor<2x3xf32> return %1, %2 : tensor<2x3xf32>, tensor<2x3xf32> } @@ -983,7 +977,8 @@ func.func @mul_quant_nofold() -> tensor<1x!quant.uniform : tensor<1xi8>} : () -> tensor<1x!quant.uniform> %1 = "tosa.const"() {value = dense<1> : tensor<1xi8>} : () -> tensor<1x!quant.uniform> - %2 = tosa.mul %0, %1 : (tensor<1x!quant.uniform>, tensor<1x!quant.uniform>)-> tensor<1x!quant.uniform> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %2 = tosa.mul %0, %1, %shift : (tensor<1x!quant.uniform>, tensor<1x!quant.uniform>, tensor<1xi8>) -> tensor<1x!quant.uniform> return %2 : tensor<1x!quant.uniform> } diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir index 40469987d89d0..e6fb741df9598 100644 --- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir +++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir @@ -238,7 +238,8 @@ func.func @fold_div_splat_i32() -> tensor { func.func @fold_mul_zero_rhs_f32(%arg0: tensor) -> tensor { %zero = "tosa.const"() {value = dense<0.0> : tensor} : () -> tensor // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> - %mul = tosa.mul %arg0, %zero : (tensor, tensor) -> tensor + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %mul = tosa.mul %arg0, %zero, %shift : (tensor, tensor, tensor<1xi8>) -> tensor // CHECK: return %[[ZERO]] return %mul : tensor } @@ -249,7 +250,8 @@ func.func @fold_mul_zero_rhs_f32(%arg0: tensor) -> tensor { func.func @fold_mul_zero_lhs_f32(%arg0: tensor) -> tensor { %zero = "tosa.const"() {value = dense<0.0> : tensor} : () -> tensor // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> - %mul = tosa.mul %zero, %arg0 : (tensor, tensor) -> tensor + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %mul = tosa.mul %zero, %arg0, %shift : (tensor, tensor, tensor<1xi8>) -> tensor // CHECK: return %[[ZERO]] return %mul : tensor } @@ -283,7 +285,8 @@ func.func @fold_mul_zero_lhs_i32(%arg0: tensor) -> tensor { // CHECK-LABEL: @fold_mul_one_rhs_f32 func.func @fold_mul_one_rhs_f32(%arg0: tensor) -> tensor { %one = "tosa.const"() {value = dense<1.0> : tensor} : () -> tensor - %mul = tosa.mul %arg0, %one : (tensor, tensor) -> tensor + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %mul = tosa.mul %arg0, %one, %shift : (tensor, tensor, tensor<1xi8>) -> tensor // CHECK: return %arg0 return %mul : tensor } @@ -293,7 +296,8 @@ func.func @fold_mul_one_rhs_f32(%arg0: tensor) -> tensor { // CHECK-LABEL: @fold_mul_one_lhs_f32 func.func @fold_mul_one_lhs_f32(%arg0: tensor) -> tensor { %one = "tosa.const"() {value = dense<1.0> : tensor} : () -> tensor - %mul = tosa.mul %one, %arg0 : (tensor, tensor) -> tensor + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %mul = tosa.mul %one, %arg0, %shift : (tensor, tensor, tensor<1xi8>) -> tensor // CHECK: return %arg0 return %mul : tensor } @@ -339,7 +343,8 @@ func.func @fold_mul_splat_i8() -> tensor<10xi32> { func.func @fold_mul_splat_f32() -> tensor<10xf32> { %one = "tosa.const"() {value = dense<3.0> : tensor<10xf32>} : () -> tensor<10xf32> %two = "tosa.const"() {value = dense<2.0> : tensor<10xf32>} : () -> tensor<10xf32> - %mul = tosa.mul %one, %two : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %mul = tosa.mul %one, 
%two, %shift : (tensor<10xf32>, tensor<10xf32>, tensor<1xi8>) -> tensor<10xf32> // CHECK: %[[THREE:.+]] = "tosa.const"() <{value = dense<6.000000e+00> : tensor<10xf32>} // CHECK: return %[[THREE]] return %mul : tensor<10xf32> diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 20fc10d77d0e0..ff874af5c5f07 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -768,26 +768,27 @@ func.func @test_transpose_conv2d_invalid_outshape(%arg0: tensor<1x32x32x8xf32>, // CHECK-LABEL: test_mul_type_mismatch func.func @test_mul_type_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf16>) -> tensor<13x21x3xf32> { + %shift = "tosa.const"() {value = dense<0> : tensor<1xi8>} : () -> tensor<1xi8> // expected-error@+1 {{'tosa.mul' op requires the same element type for all operands}} - %0 = tosa.mul %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x1x3xf16>) -> tensor<13x21x3xf32> + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xf32>, tensor<13x1x3xf16>, tensor<1xi8>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } // ----- // CHECK-LABEL: test_mul_invalid_shift -func.func @test_mul_invalid_shift(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { - %shift = "tosa.const"() {value = dense<0.0> : tensor} : () -> tensor - // expected-error@+1 {{'tosa.mul' op operand #2 must be 1D tensor of 8-bit signless integer values, but got 'tensor'}} - %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xi32>, tensor<13x1x3xi32>, tensor) -> tensor<13x21x3xi32> - return %0 : tensor<13x21x3xi32> +func.func @test_mul_invalid_shift(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { + %shift = "tosa.const"() {value = dense<1> : tensor<1xi8>} : () -> tensor<1xi8> + // expected-error@+1 {{'tosa.mul' op require shift to be 0 for float type}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xf32>, tensor<13x1x3xf32>, tensor<1xi8>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> } // ----- // CHECK-LABEL: test_mul_missing_shift func.func @test_mul_missing_shift(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { - // this is ok because mul's shift operand is optional for now + // expected-error@+1 {{'tosa.mul' op expected 3 operands, but found 2}} %0 = tosa.mul %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi32> return %0 : tensor<13x21x3xi32> } @@ -806,7 +807,7 @@ func.func @test_unsupported_int64_data_type(%arg0: tensor<1x13x13x5xf32>) -> ten // CHECK-LABEL: test_mismatch_in_out_data_type_clamp func.func @test_mismatch_in_out_data_type_clamp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf16> { // expected-error@+1 {{'tosa.clamp' op requires the same element type for all operands and results}} - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf16> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f32, max_val = 1.0: f32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf16> return %0 : tensor<13x21x3xf16> } @@ -815,7 +816,7 @@ func.func @test_mismatch_in_out_data_type_clamp(%arg0: tensor<13x21x3xf32>) -> t // CHECK-LABEL: test_mismatch_in_out_shape_clamp func.func @test_mismatch_in_out_shape_clamp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x1xf32> { // expected-error@+1 {{'tosa.clamp' op requires the same shape for all operands and results}} - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 
: i64} : (tensor<13x21x3xf32>) -> tensor<13x21x1xf32> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f32, max_val = 1.0: f32} : (tensor<13x21x3xf32>) -> tensor<13x21x1xf32> return %0 : tensor<13x21x1xf32> } @@ -1086,6 +1087,14 @@ func.func @test_const_shape_value() -> !tosa.shape<5> { // ----- +func.func @test_const_shape_value() -> !tosa.shape<4> { + // expected-error@+1 {{'tosa.const_shape' op expect elements in attribute value with rank 1}} + %cst = tosa.const_shape {value = dense<[[1, 2], [3, 4]]> : tensor<2x2xindex>} : () -> !tosa.shape<4> + return %cst : !tosa.shape<4> +} + +// ----- + func.func @test_sub_with_unequal_operand_ranks(%arg0: tensor<1x21x3xf32>, %arg1: tensor<1x13x21x3xf32>) -> tensor<1x13x21x3xf32> { // expected-error@+1 {{'tosa.sub' op operands don't have matching ranks}} %0 = tosa.sub %arg0, %arg1 : (tensor<1x21x3xf32>, tensor<1x13x21x3xf32>) -> tensor<1x13x21x3xf32> @@ -1099,3 +1108,30 @@ func.func @test_sub_with_unequal_result_ranks(%arg0: tensor<1x21x3xf32>, %arg1: %0 = tosa.sub %arg0, %arg1 : (tensor<1x21x3xf32>, tensor<13x21x3xf32>) -> tensor<1x13x21x3xf32> return %0 : tensor<1x13x21x3xf32> } + +// ----- +// CHECK-LABEL: test_mul_non_scalar_shift_2d +func.func @test_mul_non_scalar_shift_2d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { + %shift = "tosa.const"() <{value = dense<0> : tensor<1x1xi8>}> : () -> tensor<1x1xi8> + // expected-error@+1 {{'tosa.mul' op operand #2 must be tosa-conformant scalar tensor of 8-bit signless integer values, but got 'tensor<1x1xi8>'}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xf32>, tensor<13x1x3xf32>, tensor<1x1xi8>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +// CHECK-LABEL: test_mul_non_scalar_shift_1d +func.func @test_mul_non_scalar_shift_1d(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { + %shift = "tosa.const"() <{value = dense<0> : tensor<2xi8>}> : () -> tensor<2xi8> + // expected-error@+1 {{'tosa.mul' op operand #2 must be tosa-conformant scalar tensor of 8-bit signless integer values, but got 'tensor<2xi8>'}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xf32>, tensor<13x1x3xf32>, tensor<2xi8>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +// CHECK-LABEL: test_mul_non_broadcast +func.func @test_mul_non_broadcast(%arg0: tensor<13x21x2xf32>, %arg1: tensor<3x1x3xf32>) -> tensor<13x21x3xf32> { + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + // expected-error@+1 {{'tosa.mul' op operands don't have broadcast-compatible shapes}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x2xf32>, tensor<3x1x3xf32>, tensor<1xi8>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index d7e4f682c28b3..348849cfaa572 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -193,42 +193,42 @@ func.func @test_transpose_conv2d_with_local_bound(%arg0: tensor<1x32x32x8xf32>, // ----- // CHECK-LABEL: clamp func.func @test_clamp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f32, max_val = 1.0: f32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } // ----- // CHECK-LABEL: clamp_propagate func.func @test_clamp_propagate(%arg0: 
tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64, nan_mode = "PROPAGATE"} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f32, max_val = 1.0: f32, nan_mode = "PROPAGATE"} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } // ----- // CHECK-LABEL: clamp_ignore func.func @test_clamp_ignore(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64, nan_mode = "IGNORE"} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f32, max_val = 1.0: f32, nan_mode = "IGNORE"} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } // ----- // CHECK-LABEL: clamp_f16 func.func @test_clamp_f16(%arg0: tensor<13x21x3xf16>) -> tensor<13x21x3xf16> { - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f16, max_fp = 1.0: f16, min_int = 0 : i64, max_int = 1 : i64} : (tensor<13x21x3xf16>) -> tensor<13x21x3xf16> + %0 = tosa.clamp %arg0 {min_val = 0.0 : f16, max_val = 1.0: f16} : (tensor<13x21x3xf16>) -> tensor<13x21x3xf16> return %0 : tensor<13x21x3xf16> } // ----- // CHECK-LABEL: clamp_bf16 func.func @test_clamp_bf16(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { - %0 = tosa.clamp %arg0 {min_fp = 0.0 : bf16, max_fp = 1.0: bf16, min_int = 0 : i64, max_int = 1 : i64} : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + %0 = tosa.clamp %arg0 {min_val = 0.0 : bf16, max_val = 1.0: bf16} : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> return %0 : tensor<13x21x3xbf16> } // ----- // CHECK-LABEL: clamp_quantized func.func @test_clamp_quantized(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { - %0 = tosa.clamp %arg0 {min_fp = 0.0 : f32, max_fp = 1.0: f32, min_int = 0 : i64, max_int = 1 : i64} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + %0 = tosa.clamp %arg0 {min_val = 0 : i8, max_val = 1 : i8} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> return %0 : tensor<13x21x3x!quant.uniform> } @@ -355,7 +355,8 @@ func.func @test_mul_scalar_with_unranked_output(%arg0: tensor, %arg1: tenso // ----- // CHECK-LABEL: mul func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xf32> { - %0 = tosa.mul %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32> + %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xf32>, tensor<13x1x3xf32>, tensor<1xi8>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir index bdd403567a4ed..7dc9b048085fa 100644 --- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir @@ -34,7 +34,7 @@ func.func @test_unary_f32(%arg0 : tensor<4xf32>) -> () { %1 = tosa.ceil %arg0 : (tensor<4xf32>) -> tensor<*xf32> // CHECK: tosa.clamp %arg0 {{.+}} : (tensor<4xf32>) -> tensor<4xf32> - %2 = tosa.clamp %arg0 { max_int = 10 : i64, min_int = 0 : i64, min_fp = 0.0 : f32, max_fp = 10.0 : f32 } : (tensor<4xf32>) -> tensor<*xf32> + %2 = tosa.clamp %arg0 { min_val = 0.0 : f32, max_val = 10.0 : f32 } : (tensor<4xf32>) -> tensor<*xf32> // CHECK: tosa.exp %arg0 : (tensor<4xf32>) -> tensor<4xf32> %3 = tosa.exp %arg0 : (tensor<4xf32>) -> tensor<*xf32> @@ -82,7 
+82,7 @@ func.func @test_unary_i32(%arg0 : tensor<4xi32>) -> () { %1 = tosa.bitwise_not %arg0 : (tensor<4xi32>) -> tensor<*xi32> // CHECK: tosa.clamp %arg0 {{.+}} : (tensor<4xi32>) -> tensor<4xi32> - %2 = tosa.clamp %arg0 { max_int = 10 : i64, min_int = 0 : i64, min_fp = 0.0 : f32, max_fp = 10.0 : f32 } : (tensor<4xi32>) -> tensor<*xi32> + %2 = tosa.clamp %arg0 { max_val = 10 : i32, min_val = 0 : i32} : (tensor<4xi32>) -> tensor<*xi32> // CHECK: tosa.clz %arg0 : (tensor<4xi32>) -> tensor<4xi32> %3 = tosa.clz %arg0 : (tensor<4xi32>) -> tensor<*xi32> @@ -114,23 +114,24 @@ func.func @test_binary_scalar_f32(%arg0 : tensor<4xf32>, %arg1 : tensor<1xf32>) // CHECK: tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> %2 = tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> - // CHECK: tosa.mul %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> - %3 = tosa.mul %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> + %3 = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + // CHECK: tosa.mul %arg0, %arg1, %3 : (tensor<4xf32>, tensor<1xf32>, tensor<1xi8>) -> tensor<4xf32> + %4 = tosa.mul %arg0, %arg1, %3 : (tensor<4xf32>, tensor<1xf32>, tensor<1xi8>) -> tensor<*xf32> // CHECK: tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> - %4 = tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> + %5 = tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> // CHECK: tosa.sub %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> - %5 = tosa.sub %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> + %6 = tosa.sub %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> // CHECK: tosa.equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xi1> - %6 = tosa.equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> + %7 = tosa.equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> // CHECK: tosa.greater %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xi1> - %7 = tosa.greater %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> + %8 = tosa.greater %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> // CHECK: tosa.greater_equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xi1> - %8 = tosa.greater_equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> + %9 = tosa.greater_equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> return } @@ -148,23 +149,24 @@ func.func @test_binary_broadcast_f32(%arg0 : tensor<4xf32>, %arg1 : tensor<1xf32 // CHECK: tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> %2 = tosa.minimum %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> - // CHECK: tosa.mul %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> - %3 = tosa.mul %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> + %3 = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + // CHECK: tosa.mul %arg0, %arg1, %3 : (tensor<4xf32>, tensor<1xf32>, tensor<1xi8>) -> tensor<4xf32> + %4 = tosa.mul %arg0, %arg1, %3 : (tensor<4xf32>, tensor<1xf32>, tensor<1xi8>) -> tensor<*xf32> // CHECK: tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> - %4 = tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> + %5 = tosa.pow %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> // CHECK: tosa.sub %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xf32> - %5 = 
tosa.sub %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> + %6 = tosa.sub %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xf32> // CHECK: tosa.equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xi1> - %6 = tosa.equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> + %7 = tosa.equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> // CHECK: tosa.greater %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xi1> - %7 = tosa.greater %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> + %8 = tosa.greater %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> // CHECK: tosa.greater_equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<4xi1> - %8 = tosa.greater_equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> + %9 = tosa.greater_equal %arg0, %arg1 : (tensor<4xf32>, tensor<1xf32>) -> tensor<*xi1> return } @@ -211,10 +213,10 @@ func.func @test_binary_i32(%arg0 : tensor<4xi32>, %arg1 : tensor<1xi32>) -> () { %11 = tosa.mul %arg0, %arg1, %shift : (tensor<4xi32>, tensor<1xi32>, tensor<1xi8>) -> tensor<*xi32> // CHECK: tosa.pow %arg0, %arg1 : (tensor<4xi32>, tensor<1xi32>) -> tensor<4xi32> - %12 = tosa.pow %arg0, %arg1 : (tensor<4xi32>, tensor<1xi32>) -> tensor<*xi32> + %13 = tosa.pow %arg0, %arg1 : (tensor<4xi32>, tensor<1xi32>) -> tensor<*xi32> // CHECK: tosa.sub %arg0, %arg1 : (tensor<4xi32>, tensor<1xi32>) -> tensor<4xi32> - %13 = tosa.sub %arg0, %arg1 : (tensor<4xi32>, tensor<1xi32>) -> tensor<*xi32> + %14 = tosa.sub %arg0, %arg1 : (tensor<4xi32>, tensor<1xi32>) -> tensor<*xi32> return } diff --git a/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir b/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir index 947335e45a9d9..3a293009a5455 100644 --- a/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir @@ -22,7 +22,7 @@ func.func @test_transpose_tracks_to_nullifying_single_step(%arg0: tensor<1x2x3x4 func.func @test_transpose_tracks_to_nullifying_multi_unary_step(%arg0: tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> { %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32> - %clamp = tosa.clamp %0 {max_fp = 1.0 : f32, min_fp = 0.0 : f32, max_int = 1 : i64, min_int = 0 : i64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %0 {max_val = 1 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %abs = tosa.abs %clamp : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %bitwise_not = tosa.bitwise_not %abs : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -41,7 +41,7 @@ func.func @test_transpose_tracks_to_nullifying_diverging_binary(%arg0: tensor<1x %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32> %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32> - %clamp = tosa.clamp %transpose0 {max_fp = 1.0 : f32, min_fp = 0.0 : f32, max_int = 1 : i64, min_int = 0 : i64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %transpose0 {max_val = 1 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %abs = tosa.abs %transpose1 : (tensor<1x3x4x2xi32>) -> 
tensor<1x3x4x2xi32> %add = tosa.add %clamp, %abs : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -61,7 +61,7 @@ func.func @test_transpose_tracks_to_nullifying_diverging_binary_with_broadcastin %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32> %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x1x4xi32>, tensor<4xi32>) -> tensor<1x1x4x2xi32> - %clamp = tosa.clamp %transpose0 {max_fp = 1.0 : f32, min_fp = 0.0 : f32, max_int = 1 : i64, min_int = 0 : i64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %transpose0 {max_val = 1 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %abs = tosa.abs %transpose1 : (tensor<1x1x4x2xi32>) -> tensor<1x1x4x2xi32> %add = tosa.add %clamp, %abs : (tensor<1x3x4x2xi32>, tensor<1x1x4x2xi32>) -> tensor<1x3x4x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -193,6 +193,7 @@ func.func @test_reshape_for_broadcast(%arg0: tensor<4x3x2xi32>) -> tensor<4x3x2x // CHECK-LABEL: @test_resnet18_common_case // COM: note that %74 is now represented by %arg2 +// CHECK-DAG: %[[CONST0:.+]] = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> // CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense_resource : tensor<64xf32>}> : () -> tensor<64xf32> // CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense_resource : tensor<64xf32>}> : () -> tensor<64xf32> // CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1xf32>}> : () -> tensor<1xf32> @@ -205,15 +206,16 @@ func.func @test_reshape_for_broadcast(%arg0: tensor<4x3x2xi32>) -> tensor<4x3x2x // CHECK-DAG: %[[VAL_12:.*]] = tosa.sub %arg2, %[[VAL_11]] : (tensor<1x112x112x64xf32>, tensor<1x1x1x64xf32>) -> tensor<1x112x112x64xf32> // CHECK-DAG: %[[VAL_13:.*]] = tosa.const_shape {value = dense<[1, 1, 1, 64]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAL_14:.*]] = tosa.reshape %[[VAL_9]], %[[VAL_13]] : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x1x1x64xf32> -// CHECK-DAG: %[[VAL_15:.*]] = tosa.mul %[[VAL_12]], %[[VAL_14]] : (tensor<1x112x112x64xf32>, tensor<1x1x1x64xf32>) -> tensor<1x112x112x64xf32> +// CHECK-DAG: %[[VAL_15:.*]] = tosa.mul %[[VAL_12]], %[[VAL_14]], %[[CONST0]] : (tensor<1x112x112x64xf32>, tensor<1x1x1x64xf32>, tensor<1xi8>) -> tensor<1x112x112x64xf32> // CHECK-DAG: %[[VAL_16:.*]] = tosa.const_shape {value = dense<[1, 1, 1, 64]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAL_17:.*]] = tosa.reshape %[[VAL_4]], %[[VAL_16]] : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x1x1x64xf32> -// CHECK-DAG: %[[VAL_18:.*]] = tosa.mul %[[VAL_15]], %[[VAL_17]] : (tensor<1x112x112x64xf32>, tensor<1x1x1x64xf32>) -> tensor<1x112x112x64xf32> +// CHECK-DAG: %[[VAL_18:.*]] = tosa.mul %[[VAL_15]], %[[VAL_17]], %[[CONST0]] : (tensor<1x112x112x64xf32>, tensor<1x1x1x64xf32>, tensor<1xi8>) -> tensor<1x112x112x64xf32> // CHECK-DAG: %[[VAL_19:.*]] = tosa.const_shape {value = dense<[1, 1, 1, 64]> : tensor<4xindex>} : () -> !tosa.shape<4> // CHECK-DAG: %[[VAL_20:.*]] = tosa.reshape %[[VAL_3]], %[[VAL_19]] : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x1x1x64xf32> // CHECK-DAG: %[[VAL_21:.*]] = tosa.add %[[VAL_18]], %[[VAL_20]] : (tensor<1x112x112x64xf32>, tensor<1x1x1x64xf32>) -> 
tensor<1x112x112x64xf32> -// CHECK-DAG: %[[VAL_22:.*]] = tosa.clamp %[[VAL_21]] {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32> +// CHECK-DAG: %[[VAL_22:.*]] = tosa.clamp %[[VAL_21]] {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32> func.func @test_resnet18_common_case(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %74: tensor<1x112x112x64xf32>) -> tensor<1x112x112x64xf32> { + %shift = "tosa.const"() {value = dense<0> : tensor<1xi8>} : () -> tensor<1xi8> %58 = tosa.const_shape {value = dense<[1, 64, 1, 1]> : tensor<4xindex>} : () -> !tosa.shape<4> %59 = "tosa.const"() <{value = dense_resource : tensor<64xf32>}> : () -> tensor<64xf32> %60 = "tosa.const"() <{value = dense_resource : tensor<64xf32>}> : () -> tensor<64xf32> @@ -228,17 +230,16 @@ func.func @test_resnet18_common_case(%arg0: tensor<64xf32>, %arg1: tensor<64xf32 %79 = tosa.reshape %arg0, %58 : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x64x1x1xf32> %80 = tosa.sub %75, %79 : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>) -> tensor<1x64x112x112xf32> %81 = tosa.reshape %78, %58 : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x64x1x1xf32> - %82 = tosa.mul %80, %81 : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>) -> tensor<1x64x112x112xf32> + %82 = tosa.mul %80, %81, %shift : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>, tensor<1xi8>) -> tensor<1x64x112x112xf32> %83 = tosa.reshape %60, %58 : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x64x1x1xf32> - %84 = tosa.mul %82, %83 : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>) -> tensor<1x64x112x112xf32> + %84 = tosa.mul %82, %83, %shift : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>, tensor<1xi8>) -> tensor<1x64x112x112xf32> %85 = tosa.reshape %59, %58 : (tensor<64xf32>, !tosa.shape<4>) -> tensor<1x64x1x1xf32> %86 = tosa.add %84, %85 : (tensor<1x64x112x112xf32>, tensor<1x64x1x1xf32>) -> tensor<1x64x112x112xf32> - %87 = tosa.clamp %86 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32> + %87 = tosa.clamp %86 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32> %88 = tosa.transpose %87, %63 : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> return %88 : tensor<1x112x112x64xf32> } - // ----- // CHECK-LABEL: @test_back_to_back_nullifiers @@ -280,7 +281,7 @@ func.func @test_back_to_back_nullifiers_different_transposes(%arg0: tensor<2x3x4 func.func @test_no_transform_if_outside_fan_in_cone(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) { %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> - %clamp = tosa.clamp %0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> + %clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> return %1, %clamp : tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32> @@ -296,7 
+297,7 @@ func.func @test_two_different_downstream_converge_to_reshape_same_perms(%arg0: t %0 = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> %shape = tosa.const_shape {value = dense<[1, 64, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> %1 = tosa.reshape %arg0, %shape : (tensor<64xf32>, !tosa.shape<3>) -> tensor<1x64x1xf32> - %2 = tosa.clamp %1 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32> + %2 = tosa.clamp %1 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32> %3 = tosa.transpose %1, %0 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<1x1x64xf32> %4 = tosa.transpose %2, %0 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<1x1x64xf32> return %3, %4 : tensor<1x1x64xf32>, tensor<1x1x64xf32> @@ -317,7 +318,7 @@ func.func @test_two_different_downstream_converge_to_reshape_different_perms(%ar %1 = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32> %shape = tosa.const_shape {value = dense<[1, 64, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> %2 = tosa.reshape %arg0, %shape : (tensor<64xf32>, !tosa.shape<3>) -> tensor<1x64x1xf32> - %3 = tosa.clamp %2 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32> + %3 = tosa.clamp %2 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<1x64x1xf32>) -> tensor<1x64x1xf32> %4 = tosa.transpose %2, %1 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<1x1x64xf32> %5 = tosa.transpose %3, %0 : (tensor<1x64x1xf32>, tensor<3xi32>) -> tensor<64x1x1xf32> return %4, %5 : tensor<1x1x64xf32>, tensor<64x1x1xf32> @@ -335,7 +336,7 @@ func.func @test_two_different_downstream_converge_to_reshape_different_perms(%ar // CHECK: return %[[RES1]], %[[RES2]] func.func @test_outside_perms_usage_of_fan_in(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> (tensor<2x3xf32>, tensor<3x2xf32>) { %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> %1 = tosa.transpose %arg0, %0 : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> - %2 = tosa.clamp %1 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<3x2xf32>) -> tensor<3x2xf32> + %2 = tosa.clamp %1 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<3x2xf32>) -> tensor<3x2xf32> %3 = tosa.transpose %2, %0 : (tensor<3x2xf32>, tensor<2xi32>) -> tensor<2x3xf32> %4 = tosa.add %arg1, %2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> return %3, %4: tensor<2x3xf32>, tensor<3x2xf32> @@ -352,7 +353,7 @@ func.func @test_outside_perms_usage_of_fan_in(%arg0: tensor<2x3xf32>, %arg1: ten func.func @test_use_present_in_another_valid_perms_fan_in(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) -> (tensor<2x3xf32>, tensor<2x3xf32>) { %0 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> %1 = tosa.transpose %arg0, %0 : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> - %2 = tosa.clamp %1 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<3x2xf32>) -> tensor<3x2xf32> + %2 = tosa.clamp %1 {max_val = 3.40282347E+38 : f32, min_val = 0.000000e+00 : f32} : (tensor<3x2xf32>) -> tensor<3x2xf32> %3 = tosa.transpose %2, %0 : (tensor<3x2xf32>, tensor<2xi32>) -> tensor<2x3xf32> %4 = tosa.transpose %arg1, 
%0 : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> %5 = tosa.add %4, %2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> @@ -389,7 +390,7 @@ func.func @test_two_same_perms_fan_in_but_one_doesnt_convert_dependents(%arg0: t func.func @test_direct_use_in_other_transpose_with_same_perms(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) { %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> - %clamp = tosa.clamp %0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> + %clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> %2 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> @@ -413,12 +414,12 @@ func.func @test_const_transpose() -> tensor<2x3xi32> { // CHECK-LABEL: @test_transpose_tracks_to_const_single_step // CHECK: %[[NEW_CONST:.*]] = "tosa.const"() <{value = dense<0> : tensor<1x2x3x4xi32>}> : () -> tensor<1x2x3x4xi32> -// CHECK: %[[NEW_CLAMP:.*]] = tosa.clamp %[[NEW_CONST]] {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> +// CHECK: %[[NEW_CLAMP:.*]] = tosa.clamp %[[NEW_CONST]] {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK-NOT: tosa.transpose // CHECK: return %[[NEW_CLAMP]] func.func @test_transpose_tracks_to_const_single_step() -> tensor<1x2x3x4xi32> { %0 = "tosa.const"() {value = dense<0> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2xi32> - %clamp = tosa.clamp %0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> %1 = tosa.transpose %clamp, %perms1 : (tensor<1x3x4x2xi32>, tensor<4xi32>) -> tensor<1x2x3x4xi32> return %1 : tensor<1x2x3x4xi32> @@ -428,14 +429,14 @@ func.func @test_transpose_tracks_to_const_single_step() -> tensor<1x2x3x4xi32> { // CHECK-LABEL: @test_static_unary_path_to_const // CHECK: %[[NEW_CONST:.*]] = "tosa.const"() <{value = dense<1> : tensor<1x2x3x4xi32>}> : () -> tensor<1x2x3x4xi32> -// CHECK: %[[NEW_CLAMP:.*]] = tosa.clamp %[[NEW_CONST]] {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> +// CHECK: %[[NEW_CLAMP:.*]] = tosa.clamp %[[NEW_CONST]] {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK: %[[NEW_ABS:.*]] = tosa.abs %[[NEW_CLAMP]] : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK: %[[NEW_NOT:.*]] = tosa.bitwise_not %[[NEW_ABS]] : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK: return %[[NEW_NOT]] func.func @test_static_unary_path_to_const() -> tensor<1x2x3x4xi32> { %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 
= "tosa.const"() {value = dense<1> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2xi32> - %clamp = tosa.clamp %0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %abs = tosa.abs %clamp : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %bitwise_not = tosa.bitwise_not %abs : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -449,7 +450,7 @@ func.func @test_static_unary_path_to_const() -> tensor<1x2x3x4xi32> { // CHECK: %[[NEW_CONST:.*]] = "tosa.const"() // CHECK-SAME{LITERAL}: dense<[[[[1, 3, 5, 7], [9, 11, 13, 15], [17, 19, 21, 23]], [[2, 4, 6, 8], [10, 12, 14, 16], [18, 20, 22, 24]]]]> // CHECK: tensor<1x2x3x4xi32>}> : () -> tensor<1x2x3x4xi32> -// CHECK: %[[NEW_CLAMP:.*]] = tosa.clamp %arg0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> +// CHECK: %[[NEW_CLAMP:.*]] = tosa.clamp %arg0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK: %[[NEW_ABS:.*]] = tosa.abs %[[NEW_CONST]] : (tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK: %[[NEW_ADD:.*]] = tosa.add %[[NEW_ABS]], %[[NEW_CLAMP]] : (tensor<1x2x3x4xi32>, tensor<1x2x3x4xi32>) -> tensor<1x2x3x4xi32> // CHECK: return %[[NEW_ADD]] @@ -459,7 +460,7 @@ func.func @test_static_diverges_to_non_splat_const_and_nullifying(%arg0: tensor< %const = "tosa.const"() {value = dense<[[[[1, 2], [3, 4], [5, 6], [7, 8]], [[9, 10], [11, 12], [13, 14], [15, 16]], [[17, 18], [19, 20], [21, 22], [23, 24]]]]> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2xi32> - %clamp = tosa.clamp %transpose0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %transpose0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %abs = tosa.abs %const : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %add = tosa.add %abs, %clamp : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %perms2 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -475,7 +476,7 @@ func.func @test_static_diverges_to_non_splat_const_and_nullifying(%arg0: tensor< func.func @test_multi_downstream_both_nullify(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) { %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> - %clamp = tosa.clamp %0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> + %clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> %2 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> @@ -495,7 +496,7 @@ func.func @test_multi_downstream_both_nullify(%arg0: 
tensor<3x3x3x3xi32>) -> (te func.func @test_multi_downstream_one_nullifies_upstream_other_does_not(%arg0: tensor<3x3x3x3xi32>) -> (tensor<3x3x3x3xi32>, tensor<3x3x3x3xi32>) { %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = tosa.transpose %arg0, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> - %clamp = tosa.clamp %0 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> + %clamp = tosa.clamp %0 {max_val = 2147483647 : i32, min_val = 0 : i32} : (tensor<3x3x3x3xi32>) -> tensor<3x3x3x3xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> %1 = tosa.transpose %clamp, %perms1 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> %2 = tosa.transpose %clamp, %perms0 : (tensor<3x3x3x3xi32>, tensor<4xi32>) -> tensor<3x3x3x3xi32> @@ -536,7 +537,7 @@ func.func @test_transpose_tracks_to_nullifying_diverging_binary_unknown_dim_repl %perms0 = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x?x3x4xi32>, tensor<4xi32>) -> tensor %transpose1 = tosa.transpose %arg1, %perms0 : (tensor<1x2x?x4xi32>, tensor<4xi32>) -> tensor<1x?x?x2xi32> - %clamp = tosa.clamp %transpose0 {min_int = 0 : i64, max_int = 1 : i64, min_fp = 0.0 : f64, max_fp = 1.0 : f64} : (tensor) -> tensor + %clamp = tosa.clamp %transpose0 {min_val = 0 : i32, max_val = 1 : i32} : (tensor) -> tensor %abs = tosa.abs %transpose1 : (tensor<1x?x?x2xi32>) -> tensor<1x?x?x2xi32> %add = tosa.add %clamp, %abs : (tensor, tensor<1x?x?x2xi32>) -> tensor<1x3x4x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -571,7 +572,7 @@ func.func @test_unimplemented_non_const_perms(%perms: tensor<2xi32>) -> tensor) -> tensor<1x2x4x3xi32> { %perms0 = "tosa.const"() {value = dense<[0, 3, 2, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x4x3x2xi32> - %clamp = tosa.clamp %0 {min_int = 0 : i64, max_int = 1 : i64, min_fp = 0.0 : f64, max_fp = 1.0 : f64} : (tensor<1x4x3x2xi32>) -> tensor<1x4x3x2xi32> + %clamp = tosa.clamp %0 {min_val = 0 : i32, max_val = 1 : i32} : (tensor<1x4x3x2xi32>) -> tensor<1x4x3x2xi32> %perms1 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> %1 = tosa.transpose %clamp, %perms1 : (tensor<1x4x3x2xi32>, tensor<4xi32>) -> tensor<1x2x4x3xi32> return %1 : tensor<1x2x4x3xi32> @@ -653,7 +654,7 @@ func.func @test_unimplemented_static_diverges_to_one_nullifying_one_non_nullifyi %perms1 = "tosa.const"() {value = dense<[0, 3, 2, 1]> : tensor<4xi32>} : () -> tensor<4xi32> %transpose0 = tosa.transpose %arg0, %perms0 : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32> %transpose1 = tosa.transpose %arg1, %perms1 : (tensor<1x2x4x3xi32>, tensor<4xi32>) -> tensor<1x3x4x2xi32> - %clamp = tosa.clamp %transpose0 {min_int = 0 : i64, max_int = 1 : i64, min_fp = 0.0 : f64, max_fp = 1.0 : f64} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> + %clamp = tosa.clamp %transpose0 {min_val = 0 : i32, max_val = 1 : i32} : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %abs = tosa.abs %transpose1 : (tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %add = tosa.add %clamp, %abs : (tensor<1x3x4x2xi32>, tensor<1x3x4x2xi32>) -> tensor<1x3x4x2xi32> %perms2 = "tosa.const"() {value = dense<[0, 3, 1, 2]> : 
tensor<4xi32>} : () -> tensor<4xi32> diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 93581cbfbe5e4..f17d917ca521e 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2984,10 +2984,10 @@ func.func @insert_multiple_poison_idx(%a: vector<4x5x8xf32>, %b: vector<8xf32>) // ----- -// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_sizes_and_outer_source_dims_overlap // CHECK: %[[EXTRACT:.+]] = vector.extract {{.*}}[0, 0, 0, 0, 0] : vector<4xi32> from vector<8x1x2x1x1x4xi32> // CHECK-NEXT: return %[[EXTRACT]] : vector<4xi32> -func.func @contiguous_extract_strided_slices_to_extract(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<4xi32> { +func.func @contiguous_extract_strided_slices_to_extract_sizes_and_outer_source_dims_overlap(%arg0 : vector<8x1x2x1x1x4xi32>) -> vector<4xi32> { %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 1, 1, 1, 4], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x1x1x1x4xi32> %2 = vector.shape_cast %1 : vector<1x1x1x1x1x4xi32> to vector<4xi32> return %2 : vector<4xi32> @@ -2995,6 +2995,17 @@ func.func @contiguous_extract_strided_slices_to_extract(%arg0 : vector<8x1x2x1x1 // ----- +// CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_sizes_and_outer_source_dims_no_overlap +// CHECK: %[[EXTRACT:.+]] = vector.extract {{.*}}[0, 0] : vector<4xi32> from vector<8x2x4xi32> +// CHECK-NEXT: return %[[EXTRACT]] : vector<4xi32> +func.func @contiguous_extract_strided_slices_to_extract_sizes_and_outer_source_dims_no_overlap(%arg0 : vector<8x2x4xi32>) -> vector<4xi32> { + %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0], sizes = [1, 1], strides = [1, 1]} : vector<8x2x4xi32> to vector<1x1x4xi32> + %2 = vector.shape_cast %1 : vector<1x1x4xi32> to vector<4xi32> + return %2 : vector<4xi32> +} + +// ----- + // CHECK-LABEL: @contiguous_extract_strided_slices_to_extract_shorter_size_list // CHECK: %[[EXTRACT:.+]] = vector.extract {{.*}}[0, 0, 0, 0] : vector<1x4xi32> from vector<8x1x2x1x1x4xi32> // CHECK-NEXT: return %[[EXTRACT]] : vector<1x4xi32> diff --git a/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir b/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir index 1ec3441b1fde8..b98e8b07db5ce 100644 --- a/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir +++ b/mlir/test/Interfaces/InferIntRangeInterface/infer-int-range-test-ops.mlir @@ -154,3 +154,33 @@ func.func @dont_propagate_across_infinite_loop() -> index { return %2 : index } +// CHECK-LABEL: @propagate_from_block_to_iterarg +func.func @propagate_from_block_to_iterarg(%arg0: index, %arg1: i1) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = scf.if %arg1 -> (index) { + %1 = scf.if %arg1 -> (index) { + scf.yield %arg0 : index + } else { + scf.yield %arg0 : index + } + scf.yield %1 : index + } else { + scf.yield %c1 : index + } + scf.for %arg2 = %c0 to %arg0 step %c1 { + scf.if %arg1 { + %1 = arith.subi %0, %c1 : index + %2 = arith.muli %0, %1 : index + %3 = arith.addi %2, %c1 : index + scf.for %arg3 = %c0 to %3 step %c1 { + %4 = arith.cmpi uge, %arg3, %c1 : index + // CHECK-NOT: scf.if %false + scf.if %4 { + "test.foo"() : () -> () + } + } + } + } + return +} diff --git a/mlir/test/Pass/pipeline-invalid.mlir b/mlir/test/Pass/pipeline-invalid.mlir index f9dd4c29dd7f0..948a13384bc75 
100644 --- a/mlir/test/Pass/pipeline-invalid.mlir +++ b/mlir/test/Pass/pipeline-invalid.mlir @@ -1,8 +1,8 @@ // RUN: mlir-opt --no-implicit-module \ -// RUN: --pass-pipeline='any(buffer-deallocation)' --verify-diagnostics \ +// RUN: --pass-pipeline='any(test-function-pass)' --verify-diagnostics \ // RUN: --split-input-file %s -// Note: "buffer-deallocation" is a function pass. Any other function pass could +// Note: "test-function-pass" is a function pass. Any other function pass could // be used for this test. // expected-error@below {{trying to schedule a pass on an operation not marked as 'IsolatedFromAbove'}} diff --git a/mlir/test/Target/LLVMIR/external-func-dialect-attr.mlir b/mlir/test/Target/LLVMIR/external-func-dialect-attr.mlir index 6605f10f128e6..459859f5be47b 100644 --- a/mlir/test/Target/LLVMIR/external-func-dialect-attr.mlir +++ b/mlir/test/Target/LLVMIR/external-func-dialect-attr.mlir @@ -6,6 +6,6 @@ module { llvm.func external @f() attributes { nvvm.minctasm = 10 : i32 } - // CHECK: !nvvm.annotations = !{![[NVVM:[0-9]+]]} - // CHECK: ![[NVVM]] = !{ptr @f, !"minctasm", i32 10} + // CHECK: declare void @f() #[[ATTRS:[0-9]+]] + // CHECK: attributes #[[ATTRS]] = { "nvvm.minctasm"="10" } } diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 52aa69f4c481f..7f9a3ba79d724 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -84,7 +84,7 @@ llvm.mlir.global external @explicit_undef() : i32 { llvm.return %0 : i32 } -// CHECK: @int_gep = internal constant ptr getelementptr (i32, ptr @i32_global, i32 2) +// CHECK: @int_gep = internal constant ptr getelementptr (i8, ptr @i32_global, i64 8) llvm.mlir.global internal constant @int_gep() : !llvm.ptr { %addr = llvm.mlir.addressof @i32_global : !llvm.ptr %_c0 = llvm.mlir.constant(2: i32) : i32 @@ -2347,7 +2347,7 @@ llvm.func @readonly_function(%arg0: !llvm.ptr {llvm.readonly}) llvm.func @arg_mem_none_func() attributes { memory_effects = #llvm.memory_effects} -// CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none) } +// CHECK: attributes #[[ATTR]] = { memory(readwrite, argmem: none, errnomem: none) } // ----- @@ -2355,7 +2355,7 @@ llvm.func @arg_mem_none_func() attributes { llvm.func @readwrite_func() attributes { memory_effects = #llvm.memory_effects} -// CHECK: attributes #[[ATTR]] = { memory(readwrite) } +// CHECK: attributes #[[ATTR]] = { memory(readwrite, errnomem: none) } // ----- @@ -2613,11 +2613,11 @@ llvm.func @mem_effects_call() { // CHECK: #[[ATTRS_0]] // CHECK-SAME: memory(none) // CHECK: #[[ATTRS_1]] -// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write) +// CHECK-SAME: memory(read, argmem: none, inaccessiblemem: write, errnomem: none) // CHECK: #[[ATTRS_2]] -// CHECK-SAME: memory(read, inaccessiblemem: write) +// CHECK-SAME: memory(read, inaccessiblemem: write, errnomem: none) // CHECK: #[[ATTRS_3]] -// CHECK-SAME: memory(readwrite, argmem: read) +// CHECK-SAME: memory(readwrite, argmem: read, errnomem: none) // ----- diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 970cac707b058..5ab593452ab66 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -623,27 +623,25 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} { llvm.return } -// CHECK: define ptx_kernel void @kernel_func -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"cluster_max_blocks", i32 8} +// CHECK: define ptx_kernel void 
@kernel_func() #[[ATTR0:[0-9]+]] +// CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="8" } + // ----- llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.minctasm = 16} { llvm.return } -// CHECK: define ptx_kernel void @kernel_func -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"minctasm", i32 16} +// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]] +// CHECK: attributes #[[ATTR0]] = { "nvvm.minctasm"="16" } // ----- llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxnreg = 16} { llvm.return } -// CHECK: define ptx_kernel void @kernel_func -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"maxnreg", i32 16} +// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]] +// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="16" } // ----- llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array, @@ -651,13 +649,12 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array !llvm.ptr diff --git a/mlir/test/mlir-tblgen/gen-dialect-doc.td b/mlir/test/mlir-tblgen/gen-dialect-doc.td index 79d755111e8f6..72916704369ee 100644 --- a/mlir/test/mlir-tblgen/gen-dialect-doc.td +++ b/mlir/test/mlir-tblgen/gen-dialect-doc.td @@ -36,7 +36,15 @@ def ACOp : Op]>; } -def AEOp : Op]>; +def AEOp : Op { + let summary = "Op with a summary"; + let description = "Op with a description"; + let arguments = (ins ConfinedType($_self)">]>:$tensor, + I16Attr:$int_attr); + let results = (outs + ConfinedType($_self)">]>:$output + ); +} def TestAttr : DialectAttr> { let summary = "attribute summary"; @@ -81,11 +89,33 @@ def TestEnum : } // CHECK: Dialect without a [TOC] here. -// CHECK: TOC added by tool. -// CHECK: [TOC] +// CHECK-NEXT: TOC added by tool. +// CHECK-EMPTY: +// CHECK-NEXT: [TOC] +// CHECK-EMPTY: // CHECK-NOT: [TOC] -// CHECK: test.e + +// CHECK: test.e +// CHECK-EMPTY: +// CHECK-NEXT: _Op with a summary_ +// CHECK-EMPTY: +// CHECK-NEXT: Op with a description +// CHECK-EMPTY: + +// CHECK: Operands: +// CHECK-EMPTY: +// CHECK-NEXT: | Operand | Description | +// CHECK-NEXT: | :-----: | ----------- | +// CHECK-NEXT: | `tensor` | | +// CHECK-EMPTY: +// CHECK-NEXT: Results: +// CHECK-EMPTY: +// CHECK-NEXT: | Result | Description | +// CHECK-NEXT: | :----: | ----------- | +// CHECK-NEXT: | `output` | | +// CHECK-EMPTY: + // CHECK: Group of ops // CHECK: test.a // CHECK: test.d @@ -96,9 +126,11 @@ def TestEnum : // CHECK: Interfaces: `NoMemoryEffect (MemoryEffectOpInterface)` // CHECK: Effects: `MemoryEffects::Effect{}` -// CHECK: ## Attribute constraints -// CHECK: ### attribute summary -// CHECK: attribute description +// CHECK: ## Attribute constraints +// CHECK-EMPTY: +// CHECK-NEXT: ### attribute summary +// CHECK-EMPTY: +// CHECK: attribute description // CHECK: TestAttrDefAttr // CHECK: Syntax: @@ -120,15 +152,20 @@ def TestEnum : // CHECK: Syntax: // CHECK: !test.test_type_def_params -// CHECK: ## Enums -// CHECK: ### TestEnum -// CHECK: enum summary -// CHECK: #### Cases: -// CHECK: | Symbol | Value | String | -// CHECK: | :----: | :---: | ------ | -// CHECK: | First | `0` | first | -// CHECK: | Second | `1` | second | -// CHECK: | Third | `2` | third | +// CHECK: ## Enums +// CHECK-EMPTY: +// CHECK-NEXT: ### TestEnum +// CHECK-EMPTY: +// CHECK-NEXT: _Enum summary_ +// CHECK-EMPTY: +// CHECK-NEXT: #### Cases: +// CHECK-EMPTY: +// CHECK-NEXT: | Symbol | Value | String | +// CHECK-NEXT: | :----: | :---: | ------ | +// CHECK-NEXT: | First | `0` | first | +// CHECK-NEXT: | Second | `1` | second | +// CHECK-NEXT: | Third | 
`2` | third | +// CHECK-EMPTY: def Toc_Dialect : Dialect { let name = "test_toc"; diff --git a/mlir/test/mlir-tblgen/gen-pass-doc.td b/mlir/test/mlir-tblgen/gen-pass-doc.td new file mode 100644 index 0000000000000..fd8e9cccb5fb5 --- /dev/null +++ b/mlir/test/mlir-tblgen/gen-pass-doc.td @@ -0,0 +1,33 @@ +// RUN: mlir-tblgen -gen-pass-doc -I %S/../../include -dialect=test %s | FileCheck %s + +include "mlir/Pass/PassBase.td" + +def TestPassDocA : Pass<"test-pass-doc-a"> { + let summary = "pass summary"; + let description = [{ + Pass description + }]; + + let options = [ + ListOption<"option", "option", "std::string", "pass option"> + ]; +} + +def TestPassDocB : Pass<"test-pass-doc-b"> { +} + +// Ensure there are empty lines between individual pass docs. + +// CHECK: `-test-pass-doc-a` +// CHECK-EMPTY: +// CHECK-NEXT: _Pass summary_ +// CHECK-EMPTY: +// CHECK-NEXT: Pass description +// CHECK-EMPTY: +// CHECK-NEXT: Options +// CHECK-EMPTY: +// CHECK-NEXT: ``` +// CHECK-NEXT: -option : pass option +// CHECK-NEXT: ``` +// CHECK-EMPTY: +// CHECK-NEXT: `-test-pass-doc-b` diff --git a/mlir/test/python/ir/module.py b/mlir/test/python/ir/module.py index ecafcb46af217..6065e59fd6ed9 100644 --- a/mlir/test/python/ir/module.py +++ b/mlir/test/python/ir/module.py @@ -1,6 +1,7 @@ # RUN: %PYTHON %s | FileCheck %s import gc +from tempfile import NamedTemporaryFile from mlir.ir import * @@ -27,6 +28,24 @@ def testParseSuccess(): print(str(module)) +# Verify successful parse from file. +# CHECK-LABEL: TEST: testParseFromFileSuccess +# CHECK: module @successfulParse +@run +def testParseFromFileSuccess(): + ctx = Context() + with NamedTemporaryFile(mode="w") as tmp_file: + tmp_file.write(r"""module @successfulParse {}""") + tmp_file.flush() + module = Module.parseFile(tmp_file.name, ctx) + assert module.context is ctx + print("CLEAR CONTEXT") + ctx = None # Ensure that module captures the context. + gc.collect() + module.operation.verify() + print(str(module)) + + # Verify parse error. # CHECK-LABEL: TEST: testParseError # CHECK: testParseError: < diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 1c394f5680a5c..43d406e4340f7 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -54,12 +54,12 @@ cl::opt allowHugoSpecificFeatures( cl::cat(docCat)); void mlir::tblgen::emitSummary(StringRef summary, raw_ostream &os) { - if (!summary.empty()) { - StringRef trimmed = summary.trim(); - char first = std::toupper(trimmed.front()); - StringRef rest = trimmed.drop_front(); - os << "\n_" << first << rest << "_\n\n"; - } + if (summary.empty()) + return; + StringRef trimmed = summary.trim(); + char first = std::toupper(trimmed.front()); + StringRef rest = trimmed.drop_front(); + os << "\n_" << first << rest << "_\n"; } // Emit the description by aligning the text to the left per line (e.g., @@ -69,6 +69,9 @@ void mlir::tblgen::emitSummary(StringRef summary, raw_ostream &os) { // in a way the user wanted but has some additional indenting due to being // nested in the op definition. 
void mlir::tblgen::emitDescription(StringRef description, raw_ostream &os) { + if (description.empty()) + return; + os << "\n"; raw_indented_ostream ros(os); StringRef trimmed = description.rtrim(" \t"); ros.printReindented(trimmed); @@ -80,6 +83,7 @@ void mlir::tblgen::emitDescriptionComment(StringRef description, raw_ostream &os, StringRef prefix) { if (description.empty()) return; + os << "\n"; raw_indented_ostream ros(os); StringRef trimmed = description.rtrim(" \t"); ros.printReindented(trimmed, (Twine(prefix) + "/// ").str()); @@ -87,22 +91,14 @@ void mlir::tblgen::emitDescriptionComment(StringRef description, ros << "\n"; } -// Emits `str` with trailing newline if not empty. -static void emitIfNotEmpty(StringRef str, raw_ostream &os) { - if (!str.empty()) { - emitDescription(str, os); - os << "\n"; - } -} - /// Emit the given named constraint. template static void emitNamedConstraint(const T &it, raw_ostream &os) { if (!it.name.empty()) os << "| `" << it.name << "`"; else - os << "«unnamed»"; - os << " | " << it.constraint.getSummary() << "\n"; + os << "| «unnamed»"; + os << " | " << it.constraint.getSummary() << " |\n"; } //===----------------------------------------------------------------------===// @@ -112,6 +108,8 @@ static void emitNamedConstraint(const T &it, raw_ostream &os) { /// Emit the assembly format of an operation. static void emitAssemblyFormat(StringRef opName, StringRef format, raw_ostream &os) { + if (format.empty()) + return; os << "\nSyntax:\n\n```\noperation ::= `" << opName << "` "; // Print the assembly format aligned. @@ -124,7 +122,7 @@ static void emitAssemblyFormat(StringRef opName, StringRef format, if (!formatChunk.empty()) os.indent(indent) << formatChunk << "\n"; } while (!split.second.empty()); - os << "```\n\n"; + os << "```\n"; } /// Place `text` between backticks so that the Markdown processor renders it as @@ -199,7 +197,7 @@ static void emitOpDoc(const Operator &op, raw_ostream &os) { std::string classNameStr = op.getQualCppClassName(); StringRef className = classNameStr; (void)className.consume_front(stripPrefix); - os << formatv("### `{0}` ({1})\n", op.getOperationName(), className); + os << formatv("\n### `{0}` ({1})\n", op.getOperationName(), className); // Emit the summary, syntax, and description if present. 
if (op.hasSummary()) @@ -281,8 +279,8 @@ static void emitSourceLink(StringRef inputFilename, raw_ostream &os) { StringRef inputFromMlirInclude = inputFilename.substr(pathBegin); - os << "[source](https://github.com/llvm/llvm-project/blob/main/" - << inputFromMlirInclude << ")\n\n"; + os << "\n[source](https://github.com/llvm/llvm-project/blob/main/" + << inputFromMlirInclude << ")\n"; } static void emitOpDoc(const RecordKeeper &records, raw_ostream &os) { @@ -299,9 +297,9 @@ static void emitOpDoc(const RecordKeeper &records, raw_ostream &os) { //===----------------------------------------------------------------------===// static void emitAttrDoc(const Attribute &attr, raw_ostream &os) { - os << "### " << attr.getSummary() << "\n\n"; + os << "\n### " << attr.getSummary() << "\n"; emitDescription(attr.getDescription(), os); - os << "\n\n"; + os << "\n"; } //===----------------------------------------------------------------------===// @@ -309,9 +307,9 @@ static void emitAttrDoc(const Attribute &attr, raw_ostream &os) { //===----------------------------------------------------------------------===// static void emitTypeDoc(const Type &type, raw_ostream &os) { - os << "### " << type.getSummary() << "\n\n"; + os << "\n### " << type.getSummary() << "\n"; emitDescription(type.getDescription(), os); - os << "\n\n"; + os << "\n"; } //===----------------------------------------------------------------------===// @@ -342,11 +340,11 @@ static void emitAttrOrTypeDefAssemblyFormat(const AttrOrTypeDef &def, } static void emitAttrOrTypeDefDoc(const AttrOrTypeDef &def, raw_ostream &os) { - os << formatv("### {0}\n", def.getCppClassName()); + os << formatv("\n### {0}\n", def.getCppClassName()); // Emit the summary if present. if (def.hasSummary()) - os << "\n" << def.getSummary() << "\n"; + emitSummary(def.getSummary(), os); // Emit the syntax if present. if (def.getMnemonic() && !def.hasCustomAssemblyFormat()) @@ -354,7 +352,6 @@ static void emitAttrOrTypeDefDoc(const AttrOrTypeDef &def, raw_ostream &os) { // Emit the description if present. if (def.hasDescription()) { - os << "\n"; mlir::tblgen::emitDescription(def.getDescription(), os); } @@ -363,11 +360,11 @@ static void emitAttrOrTypeDefDoc(const AttrOrTypeDef &def, raw_ostream &os) { if (!parameters.empty()) { os << "\n#### Parameters:\n\n"; os << "| Parameter | C++ type | Description |\n" - << "| :-------: | :-------: | ----------- |\n"; + << "| :-------: | :-------: | ----------- |"; for (const auto &it : parameters) { auto desc = it.getSummary(); - os << "| " << it.getName() << " | `" << it.getCppType() << "` | " - << (desc ? *desc : "") << " |\n"; + os << "\n| " << it.getName() << " | `" << it.getCppType() << "` | " + << (desc ? *desc : "") << " |"; } } @@ -388,20 +385,19 @@ static void emitAttrOrTypeDefDoc(const RecordKeeper &records, raw_ostream &os, //===----------------------------------------------------------------------===// static void emitEnumDoc(const EnumAttr &def, raw_ostream &os) { - os << formatv("### {0}\n", def.getEnumClassName()); + os << formatv("\n### {0}\n", def.getEnumClassName()); // Emit the summary if present. - if (!def.getSummary().empty()) - os << "\n" << def.getSummary() << "\n"; + emitSummary(def.getSummary(), os); // Emit case documentation. 
std::vector cases = def.getAllCases(); os << "\n#### Cases:\n\n"; os << "| Symbol | Value | String |\n" - << "| :----: | :---: | ------ |\n"; + << "| :----: | :---: | ------ |"; for (const auto &it : cases) { - os << "| " << it.getSymbol() << " | `" << it.getValue() << "` | " - << it.getStr() << " |\n"; + os << "\n| " << it.getSymbol() << " | `" << it.getValue() << "` | " + << it.getStr() << " |"; } os << "\n"; @@ -447,7 +443,7 @@ static void emitBlock(ArrayRef attributes, StringRef inputFilename, ArrayRef types, ArrayRef typeDefs, ArrayRef enums, raw_ostream &os) { if (!ops.empty()) { - os << "## Operations\n\n"; + os << "\n## Operations\n"; emitSourceLink(inputFilename, os); for (const OpDocGroup &grouping : ops) { bool nested = !grouping.summary.empty(); @@ -455,9 +451,9 @@ static void emitBlock(ArrayRef attributes, StringRef inputFilename, nested, [&](raw_ostream &os) { if (nested) { - os << "## " << StringRef(grouping.summary).trim() << "\n\n"; + os << "\n## " << StringRef(grouping.summary).trim() << "\n"; emitDescription(grouping.description, os); - os << "\n\n"; + os << "\n"; } for (const Operator &op : grouping.ops) { emitOpDoc(op, os); @@ -468,32 +464,32 @@ static void emitBlock(ArrayRef attributes, StringRef inputFilename, } if (!attributes.empty()) { - os << "## Attribute constraints\n\n"; + os << "\n## Attribute constraints\n"; for (const Attribute &attr : attributes) emitAttrDoc(attr, os); } if (!attrDefs.empty()) { - os << "## Attributes\n\n"; + os << "\n## Attributes\n"; for (const AttrDef &def : attrDefs) emitAttrOrTypeDefDoc(def, os); } // TODO: Add link between use and def for types if (!types.empty()) { - os << "## Type constraints\n\n"; + os << "\n## Type constraints\n"; for (const Type &type : types) emitTypeDoc(type, os); } if (!typeDefs.empty()) { - os << "## Types\n\n"; + os << "\n## Types\n"; for (const TypeDef &def : typeDefs) emitAttrOrTypeDefDoc(def, os); } if (!enums.empty()) { - os << "## Enums\n\n"; + os << "\n## Enums\n"; for (const EnumAttr &def : enums) emitEnumDoc(def, os); } @@ -504,14 +500,14 @@ static void emitDialectDoc(const Dialect &dialect, StringRef inputFilename, ArrayRef attrDefs, ArrayRef ops, ArrayRef types, ArrayRef typeDefs, ArrayRef enums, raw_ostream &os) { - os << "# '" << dialect.getName() << "' Dialect\n\n"; - emitIfNotEmpty(dialect.getSummary(), os); - emitIfNotEmpty(dialect.getDescription(), os); + os << "\n# '" << dialect.getName() << "' Dialect\n"; + emitSummary(dialect.getSummary(), os); + emitDescription(dialect.getDescription(), os); // Generate a TOC marker except if description already contains one. Regex r("^[[:space:]]*\\[TOC\\]$", Regex::RegexFlags::Newline); if (!r.match(dialect.getDescription())) - os << "[TOC]\n\n"; + os << "\n[TOC]\n"; emitBlock(attributes, inputFilename, attrDefs, ops, types, typeDefs, enums, os); diff --git a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp index 1f1b1d9a34039..dcd68e6c2d636 100644 --- a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp +++ b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp @@ -627,8 +627,8 @@ static void emitInterfaceDoc(const Record &interfaceDef, raw_ostream &os) { Interface interface(&interfaceDef); // Emit the interface name followed by the description. 
- os << "## " << interface.getName() << " (`" << interfaceDef.getName() - << "`)\n\n"; + os << "\n## " << interface.getName() << " (`" << interfaceDef.getName() + << "`)\n"; if (auto description = interface.getDescription()) mlir::tblgen::emitDescription(*description, os); @@ -636,7 +636,7 @@ static void emitInterfaceDoc(const Record &interfaceDef, raw_ostream &os) { os << "\n### Methods:\n"; for (const auto &method : interface.getMethods()) { // Emit the method name. - os << "#### `" << method.getName() << "`\n\n```c++\n"; + os << "\n#### `" << method.getName() << "`\n\n```c++\n"; // Emit the method signature. if (method.isStatic()) @@ -656,13 +656,13 @@ static void emitInterfaceDoc(const Record &interfaceDef, raw_ostream &os) { if (!method.getBody()) os << "\nNOTE: This method *must* be implemented by the user."; - os << "\n\n"; + os << "\n"; } } bool InterfaceGenerator::emitInterfaceDocs() { os << "\n"; - os << "# " << interfaceBaseType << " definitions\n"; + os << "\n# " << interfaceBaseType << " definitions\n"; for (const auto *def : defs) emitInterfaceDoc(*def, os); diff --git a/mlir/tools/mlir-tblgen/PassDocGen.cpp b/mlir/tools/mlir-tblgen/PassDocGen.cpp index a2cb514ece3eb..456f9ceffeb9b 100644 --- a/mlir/tools/mlir-tblgen/PassDocGen.cpp +++ b/mlir/tools/mlir-tblgen/PassDocGen.cpp @@ -22,14 +22,14 @@ using llvm::RecordKeeper; /// Emit the documentation for the given pass. static void emitDoc(const Pass &pass, raw_ostream &os) { - os << llvm::formatv("### `-{0}`\n", pass.getArgument()); + os << llvm::formatv("\n### `-{0}`\n", pass.getArgument()); emitSummary(pass.getSummary(), os); emitDescription(pass.getDescription(), os); // Handle the options of the pass. ArrayRef options = pass.getOptions(); if (!options.empty()) { - os << "\n#### Options\n```\n"; + os << "\n#### Options\n\n```\n"; size_t longestOption = 0; for (const PassOption &option : options) longestOption = std::max(option.getArgument().size(), longestOption); @@ -44,7 +44,7 @@ static void emitDoc(const Pass &pass, raw_ostream &os) { // Handle the statistics of the pass. 
ArrayRef<PassStatistic> stats = pass.getStatistics(); if (!stats.empty()) { - os << "\n#### Statistics\n```\n"; + os << "\n#### Statistics\n\n```\n"; size_t longestStat = 0; for (const PassStatistic &stat : stats) longestStat = std::max(stat.getName().size(), longestStat); diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp index cb915a092a0be..c036fe26b1b36 100644 --- a/mlir/unittests/Bytecode/BytecodeTest.cpp +++ b/mlir/unittests/Bytecode/BytecodeTest.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/raw_ostream.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -37,6 +38,29 @@ module @TestDialectResources attributes { #-} )"; +struct MockOstream final : public raw_ostream { + std::unique_ptr<char[]> buffer; + size_t size = 0; + + MOCK_METHOD(void, reserveExtraSpace, (uint64_t extraSpace), (override)); + + MockOstream() : raw_ostream(true) {} + uint64_t current_pos() const override { return pos; } + +private: + size_t pos = 0; + + void write_impl(const char *ptr, size_t length) override { + if (pos + length <= size) { + memcpy((void *)(buffer.get() + pos), ptr, length); + pos += length; + } else { + report_fatal_error( + "Attempted to write past the end of the fixed size buffer."); + } + } +}; + TEST(Bytecode, MultiModuleWithResource) { MLIRContext context; Builder builder(&context); @@ -45,12 +69,17 @@ TEST(Bytecode, MultiModuleWithResource) { parseSourceString<Operation *>(irWithResources, parseConfig); ASSERT_TRUE(module); - // Write the module to bytecode - std::string buffer; - llvm::raw_string_ostream ostream(buffer); + // Write the module to bytecode. + MockOstream ostream; + EXPECT_CALL(ostream, reserveExtraSpace).WillOnce([&](uint64_t space) { + ostream.buffer = std::make_unique<char[]>(space); + ostream.size = space; + }); ASSERT_TRUE(succeeded(writeBytecodeToFile(module.get(), ostream))); // Create copy of buffer which is aligned to requested resource alignment.
+ std::string buffer((char *)ostream.buffer.get(), + (char *)ostream.buffer.get() + ostream.size); constexpr size_t kAlignment = 0x20; size_t bufferSize = buffer.size(); buffer.reserve(bufferSize + kAlignment - 1); diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index 147583c209fc3..db8443a7be933 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t { FunctionTracing = 1U << 1, CommonIssues = 1U << 2, AllocationTracker = 1U << 3, + PGODump = 1U << 4, }; struct DeviceEnvironmentTy { diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 7c7e6de613c9f..e030ab9e6b61f 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -63,14 +63,22 @@ struct __llvm_profile_data { #include "llvm/ProfileData/InstrProfData.inc" }; +extern "C" { +extern int __attribute__((weak)) __llvm_write_custom_profile( + const char *Target, const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, const char *NamesEnd); +} + /// PGO profiling data extracted from a GPU device struct GPUProfGlobals { - SmallVector<uint8_t> NamesData; - SmallVector<SmallVector<int64_t>> Counts; + SmallVector<int64_t> Counts; SmallVector<__llvm_profile_data> Data; + SmallVector<uint8_t> NamesData; Triple TargetTriple; void dump() const; + Error write() const; }; /// Subclass of GlobalTy that holds the memory for a global of \p Ty. diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h index 7b031083647aa..ab110dd88315a 100644 --- a/offload/plugins-nextgen/common/include/RPC.h +++ b/offload/plugins-nextgen/common/include/RPC.h @@ -72,6 +72,9 @@ struct RPCServerTy { /// Array of associated devices. These must be alive as long as the server is. std::unique_ptr<plugin::GenericDeviceTy *[]> Devices; + /// Mutex that guards accesses to the buffers and device array. + std::mutex BufferMutex{}; + /// A helper class for running the user thread that handles the RPC interface. /// Because we only need to check the RPC server while any kernels are /// working, we track submission / completion events to allow the thread to @@ -90,6 +93,9 @@ struct RPCServerTy { std::condition_variable CV; std::mutex Mutex; + /// A reference to the main server's mutex. + std::mutex &BufferMutex; + /// A reference to all the RPC interfaces that the server is handling. llvm::ArrayRef<void *> Buffers; @@ -98,9 +104,9 @@ struct RPCServerTy { /// Initialize the worker thread to run in the background.
ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[], - size_t Length) - : Running(false), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length), - Devices(Devices, Length) {} + size_t Length, std::mutex &BufferMutex) + : Running(false), NumUsers(0), CV(), Mutex(), BufferMutex(BufferMutex), + Buffers(Buffers, Length), Devices(Devices, Length) {} ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); } diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index 8854fc52205a7..8783490831e25 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -206,7 +206,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data()); if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) return Err; - DeviceProfileData.Counts.push_back(std::move(Counts)); + DeviceProfileData.Counts.append(std::move(Counts)); } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) { // Read profiling data for this global variable __llvm_profile_data Data{}; @@ -224,15 +224,14 @@ void GPUProfGlobals::dump() const { << "\n"; outs() << "======== Counters =========\n"; - for (const auto &Count : Counts) { - outs() << "["; - for (size_t i = 0; i < Count.size(); i++) { - if (i == 0) - outs() << " "; - outs() << Count[i] << " "; - } - outs() << "]\n"; + for (size_t i = 0; i < Counts.size(); i++) { + if (i > 0 && i % 10 == 0) + outs() << "\n"; + else if (i != 0) + outs() << " "; + outs() << Counts[i]; } + outs() << "\n"; outs() << "========== Data ===========\n"; for (const auto &ProfData : Data) { @@ -264,3 +263,43 @@ void GPUProfGlobals::dump() const { Symtab.dumpNames(outs()); outs() << "===========================\n"; } + +Error GPUProfGlobals::write() const { + if (!__llvm_write_custom_profile) + return Plugin::error("Could not find symbol __llvm_write_custom_profile. " + "The compiler-rt profiling library must be linked for " + "GPU PGO to work."); + + size_t DataSize = Data.size() * sizeof(__llvm_profile_data), + CountsSize = Counts.size() * sizeof(int64_t); + __llvm_profile_data *DataBegin, *DataEnd; + char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd; + + // Initialize array of contiguous data. 
We need to make sure each section is + // contiguous so that the PGO library can compute deltas properly + SmallVector<char> ContiguousData(NamesData.size() + DataSize + CountsSize); + + // Compute region pointers + DataBegin = (__llvm_profile_data *)(ContiguousData.data() + CountsSize); + DataEnd = + (__llvm_profile_data *)(ContiguousData.data() + CountsSize + DataSize); + CountersBegin = (char *)ContiguousData.data(); + CountersEnd = (char *)(ContiguousData.data() + CountsSize); + NamesBegin = (char *)(ContiguousData.data() + CountsSize + DataSize); + NamesEnd = (char *)(ContiguousData.data() + CountsSize + DataSize + + NamesData.size()); + + // Copy data to contiguous buffer + memcpy(DataBegin, Data.data(), DataSize); + memcpy(CountersBegin, Counts.data(), CountsSize); + memcpy(NamesBegin, NamesData.data(), NamesData.size()); + + // Invoke compiler-rt entrypoint + int result = __llvm_write_custom_profile(TargetTriple.str().c_str(), + DataBegin, DataEnd, CountersBegin, + CountersEnd, NamesBegin, NamesEnd); + if (result != 0) + return Plugin::error("Error writing GPU PGO data to file"); + + return Plugin::success(); +} diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 48c9b671c1a91..bb3fc77258f3e 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -861,8 +861,14 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { if (!ProfOrErr) return ProfOrErr.takeError(); - // TODO: write data to profiling file - ProfOrErr->dump(); + // Dump out profdata + if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) == + uint32_t(DeviceDebugKind::PGODump)) + ProfOrErr->dump(); + + // Write data to profiling file + if (auto Err = ProfOrErr->write()) + return Err; } // Delete the memory manager before deinitializing the device.
Otherwise, diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp index 4289f920c0e1e..70f572923d4b1 100644 --- a/offload/plugins-nextgen/common/src/RPC.cpp +++ b/offload/plugins-nextgen/common/src/RPC.cpp @@ -128,6 +128,7 @@ void RPCServerTy::ServerThread::run() { Lock.unlock(); while (NumUsers.load(std::memory_order_relaxed) > 0 && Running.load(std::memory_order_relaxed)) { + std::lock_guard<std::mutex> Lock(BufferMutex); for (const auto &[Buffer, Device] : llvm::zip_equal(Buffers, Devices)) { if (!Buffer || !Device) continue; @@ -146,7 +147,7 @@ RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin) Devices(std::make_unique<plugin::GenericDeviceTy *[]>( Plugin.getNumDevices())), Thread(new ServerThread(Buffers.get(), Devices.get(), - Plugin.getNumDevices())) {} + Plugin.getNumDevices(), BufferMutex)) {} llvm::Error RPCServerTy::startThread() { Thread->startThread(); @@ -187,6 +188,8 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, if (auto Err = Device.dataSubmit(ClientGlobal.getPtr(), &client, sizeof(rpc::Client), nullptr)) return Err; + std::lock_guard<std::mutex> Lock(BufferMutex); Buffers[Device.getDeviceId()] = RPCBuffer; Devices[Device.getDeviceId()] = &Device; @@ -194,6 +196,7 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, } Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) { + std::lock_guard<std::mutex> Lock(BufferMutex); Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST); Buffers[Device.getDeviceId()] = nullptr; Devices[Device.getDeviceId()] = nullptr; diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 658ae5f9653ba..1e265d2c30904 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target) if config.libomptarget_has_libc: config.available_features.add('libc') +profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata") if config.libomptarget_test_pgo: config.available_features.add('pgo') + config.substitutions.append(("%profdata", profdata_path)) # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher.
@@ -407,6 +409,8 @@ if config.test_fortran_compiler: config.available_features.add('flang') config.substitutions.append(("%flang", config.test_fortran_compiler)) +config.substitutions.append(("%target_triple", config.libomptarget_current_target)) + config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path: config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path)) diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index ce3f6abf50a13..00f4e2b74a5b0 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,6 +1,6 @@ @AUTO_GEN_COMMENT@ -config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" +config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@" diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 5dc1e5d95caf3..6fe4487ffb67f 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,12 +1,17 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ -// RUN: -Xclang "-fprofile-instrument=clang" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ -// RUN: --check-prefix="CLANG-PGO" // RUN: %libomptarget-compile-generic -fprofile-generate \ // RUN: -Xclang "-fprofile-instrument=llvm" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 +// RUN: %profdata show --all-functions --counts \ +// RUN: %target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" +// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ +// RUN: -Xclang "-fprofile-instrument=clang" +// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 +// RUN: %profdata show --all-functions --counts \ +// RUN: %target_triple.clang.profraw | %fcheck-generic \ +// RUN: --check-prefix="CLANG-PGO" + // REQUIRES: gpu // REQUIRES: pgo @@ -27,48 +32,35 @@ int main() { } } } +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 4 +// LLVM-PGO: Block counts: [20, 10, 2, 1] + +// LLVM-PGO-LABEL: test1: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Block counts: [10] + +// LLVM-PGO-LABEL: test2: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Block counts: [20] + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 3 +// CLANG-PGO: Function count: 0 +// CLANG-PGO: Block counts: [11, 20] -// CLANG-PGO: ======== Counters ========= -// CLANG-PGO-NEXT: [ 0 11 20 ] -// CLANG-PGO-NEXT: [ 10 ] -// CLANG-PGO-NEXT: [ 20 ] -// CLANG-PGO-NEXT: ========== Data =========== -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{.*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{.*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} 
{{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{.*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: ======== Functions ======== -// CLANG-PGO-NEXT: pgo1.c: -// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// CLANG-PGO-NEXT: test1 -// CLANG-PGO-NEXT: test2 +// CLANG-PGO-LABEL: test1: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 10 +// CLANG-PGO: Block counts: [] -// LLVM-PGO: ======== Counters ========= -// LLVM-PGO-NEXT: [ 20 10 2 1 ] -// LLVM-PGO-NEXT: [ 10 ] -// LLVM-PGO-NEXT: [ 20 ] -// LLVM-PGO-NEXT: ========== Data =========== -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{.*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{.*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{.*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: ======== Functions ======== -// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// LLVM-PGO-NEXT: test1 -// LLVM-PGO-NEXT: test2 +// CLANG-PGO-LABEL: test2: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 20 +// CLANG-PGO: Block counts: [] diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index 951c651f42f29..cd78a5ba88e2c 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1522,3 +1522,4 @@ debugging features are supported. * Enable debugging assertions in the device. ``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` * Enable device malloc statistics (amdgpu only). ``0x8`` + * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10`` diff --git a/openmp/runtime/test/misc_bugs/simd_conservative_ordered.c b/openmp/runtime/test/misc_bugs/simd_conservative_ordered.c new file mode 100644 index 0000000000000..af0fa66eee1d7 --- /dev/null +++ b/openmp/runtime/test/misc_bugs/simd_conservative_ordered.c @@ -0,0 +1,84 @@ +// RUN: %libomp-compile -O3 -ffast-math +// RUN: %libomp-run +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +int compare_float(float x1, float x2, float scalar) { + const float diff = fabsf(x1 - x2); + x1 = fabsf(x1); + x2 = fabsf(x2); + const float l = (x2 > x1) ?
x2 : x1; + if (diff <= l * scalar * FLT_EPSILON) + return 1; + else + return 0; +} + +#define ARRAY_SIZE 256 + +__attribute__((noinline)) void +initialization_loop(float X[ARRAY_SIZE][ARRAY_SIZE], + float Y[ARRAY_SIZE][ARRAY_SIZE]) { + const float max = 1000.0; + srand(time(NULL)); + for (int r = 0; r < ARRAY_SIZE; r++) { + for (int c = 0; c < ARRAY_SIZE; c++) { + X[r][c] = ((float)rand() / (float)(RAND_MAX)) * max; + Y[r][c] = X[r][c]; + } + } +} + +__attribute__((noinline)) void omp_simd_loop(float X[ARRAY_SIZE][ARRAY_SIZE]) { + for (int r = 1; r < ARRAY_SIZE; ++r) { + for (int c = 1; c < ARRAY_SIZE; ++c) { +#pragma omp simd + for (int k = 2; k < ARRAY_SIZE; ++k) { +#pragma omp ordered simd + X[r][k] = X[r][k - 2] + sinf((float)(r / c)); + } + } + } +} + +__attribute__((noinline)) int comparison_loop(float X[ARRAY_SIZE][ARRAY_SIZE], + float Y[ARRAY_SIZE][ARRAY_SIZE]) { + int totalErrors_simd = 0; + const float scalar = 1.0; + for (int r = 1; r < ARRAY_SIZE; ++r) { + for (int c = 1; c < ARRAY_SIZE; ++c) { + for (int k = 2; k < ARRAY_SIZE; ++k) { + Y[r][k] = Y[r][k - 2] + sinf((float)(r / c)); + } + } + // check row for simd update + for (int k = 0; k < ARRAY_SIZE; ++k) { + if (!compare_float(X[r][k], Y[r][k], scalar)) { + ++totalErrors_simd; + } + } + } + return totalErrors_simd; +} + +int main(void) { + float X[ARRAY_SIZE][ARRAY_SIZE]; + float Y[ARRAY_SIZE][ARRAY_SIZE]; + + initialization_loop(X, Y); + omp_simd_loop(X); + const int totalErrors_simd = comparison_loop(X, Y); + + if (totalErrors_simd) { + fprintf(stdout, "totalErrors_simd: %d \n", totalErrors_simd); + fprintf(stdout, "%s : %d - FAIL: error in ordered simd computation.\n", + __FILE__, __LINE__); + } else { + fprintf(stdout, "Success!\n"); + } + + return totalErrors_simd; +} diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 9a5261bf2f642..93695f8e26d27 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -296,6 +296,8 @@ /* HAVE_PROC_PID_RUSAGE defined in Bazel */ +#define HAVE_GETAUXVAL 1 + /* Directly provide definitions here behind platform preprocessor definitions. * The preprocessor conditions are sufficient to handle all of the configuration * on platforms targeted by Bazel, and defining these here more faithfully diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index 61f4700b057ab..561f2b8f408f0 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -224,8 +224,6 @@ cc_test( # Skip a test that relies on reading files in a way that doesn't easily # work with Bazel. "--gtest_filter=-NativeSymbolReuseTest.*", - # TODO: this test is failing on some configs, investigate and re-enable it. 
- "--gtest_filter=-DebugLineBasicFixture.LookupAddressRangeWithStmtSequenceOffset", ], features = ["-layering_check"], # #include "../lib/CodeGen/AsmPrinter/DwarfStringPool.h" deps = [ diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index e07891f004850..7cf259a1b58cd 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4259,6 +4259,7 @@ cc_library( ":ArithDialect", ":ArithUtils", ":FuncDialect", + ":FunctionInterfaces", ":IR", ":MemRefDialect", ":Pass", @@ -9298,6 +9299,7 @@ cc_library( ":OpenMPDialect", ":Support", ":TransformUtils", + "//llvm:Analysis", "//llvm:Core", "//llvm:FrontendOpenMP", "//llvm:Support", @@ -13269,6 +13271,8 @@ cc_library( ":FuncTransforms", ":IR", ":MemRefDialect", + ":MeshDialect", + ":MeshShardingInterface", ":Pass", ":SideEffectInterfaces", ":Support", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/mlir-tblgen/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/mlir-tblgen/BUILD.bazel index 7fe1fdc8c6936..1dd418c75984e 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/mlir-tblgen/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/mlir-tblgen/BUILD.bazel @@ -34,6 +34,7 @@ package(default_visibility = ["//visibility:public"]) "//mlir:include/mlir/IR/OpBase.td", "//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", + "//mlir:include/mlir/Pass/PassBase.td", "//mlir:mlir-opt", "//mlir:mlir-tblgen", "//mlir/test:lit_data", diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index f6f10ea4f4f83..835201f2a45b0 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -295,4 +295,6 @@ #cmakedefine HAVE_BUILTIN_THREAD_POINTER ${HAVE_BUILTIN_THREAD_POINTER} +#cmakedefine HAVE_GETAUXVAL ${HAVE_GETAUXVAL} + #endif diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake index 629977cc11d68..239f9dd3f38db 100644 --- a/utils/bazel/llvm_configs/llvm-config.h.cmake +++ b/utils/bazel/llvm_configs/llvm-config.h.cmake @@ -201,4 +201,7 @@ /* Define if logf128 is available */ #cmakedefine LLVM_HAS_LOGF128 +/* Define if building LLVM with LLVM_BUILD_TELEMETRY */ +#cmakedefine LLVM_BUILD_TELEMETRY ${LLVM_BUILD_TELEMETRY} + #endif